Jhossain committed
Commit d3731c3 · verified · Parent(s): d3b67ac

Update app.py

Files changed (1):
  1. app.py +53 -397
app.py CHANGED
@@ -8,15 +8,12 @@ import time
 import torch
 import torchaudio
 
-
 #download for mecab
 os.system('python -m unidic download')
 
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 
-# langid is used to detect language for longer text
-# Most users expect text to be their own language, there is checkbox to disable it
 import langid
 import base64
 import csv
@@ -37,18 +34,15 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 
 from huggingface_hub import HfApi
 
-# will use api to restart space on a unrecoverable error
 api = HfApi(token=HF_TOKEN)
 repo_id = "coqui/xtts"
 
-# Use never ffmpeg binary for Ubuntu20 to use denoising for microphone input
 print("Export newer ffmpeg binary for denoise filter")
 ZipFile("ffmpeg.zip").extractall()
 print("Make ffmpeg binary executable")
 st = os.stat("ffmpeg")
 os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
 
-# This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V2")
 from TTS.utils.manage import ModelManager
 
@@ -70,7 +64,6 @@ model.load_checkpoint(
 )
 model.cuda()
 
-# This is for debugging purposes only
 DEVICE_ASSERT_DETECTED = 0
 DEVICE_ASSERT_PROMPT = None
 DEVICE_ASSERT_LANG = None
@@ -92,43 +85,20 @@ def predict(
         gr.Warning(
             f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
         )
+        return (None, None, None, None)
 
-        return (
-            None,
-            None,
-            None,
-            None,
-        )
-
-        language_predicted = langid.classify(prompt)[
-            0
-        ].strip()  # strip need as there is space at end!
-
-        # tts expects chinese as zh-cn
+        language_predicted = langid.classify(prompt)[0].strip()
         if language_predicted == "zh":
-            # we use zh-cn
            language_predicted = "zh-cn"
 
        print(f"Detected language:{language_predicted}, Chosen language:{language}")
 
-        # After text character length 15 trigger language detection
        if len(prompt) > 15:
-            # allow any language for short text as some may be common
-            # If user unchecks language autodetection it will not trigger
-            # You may remove this completely for own use
            if language_predicted != language and not no_lang_auto_detect:
-                # Please duplicate and remove this check if you really want this
-                # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
                gr.Warning(
-                    f"It looks like your text isnt the language you chose , if youre sure the text is the same language you chose, please check disable language auto-detection checkbox"
-                )
-
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
+                    f"It looks like your text isn't the language you chose, if you're sure the text is the same language you chose, please check disable language auto-detection checkbox"
                )
+                return (None, None, None, None)
 
        if use_mic == True:
            if mic_file_path is not None:
@@ -137,20 +107,10 @@ def predict(
                gr.Warning(
                    "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
                )
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
-
+                return (None, None, None, None)
        else:
            speaker_wav = audio_file_pth
 
-        # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
-        # This is fast filtering not perfect
-
-        # Apply all on demand
        lowpassfilter = denoise = trim = loudness = True
 
        if lowpassfilter:
@@ -159,22 +119,14 @@ def predict(
            lowpass_highpass = ""
 
        if trim:
-            # better to remove silence in beginning and end for microphone
            trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        else:
            trim_silence = ""
 
        if voice_cleanup:
            try:
-                out_filename = (
-                    speaker_wav + str(uuid.uuid4()) + ".wav"
-                )  # ffmpeg to know output format
-
-                # we will use newer ffmpeg as that has afftn denoise filter
-                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
-                    " "
-                )
-
+                out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"
+                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
                command_result = subprocess.run(
                    [item for item in shell_command],
                    capture_output=False,
@@ -184,39 +136,26 @@ def predict(
                speaker_wav = out_filename
                print("Filtered microphone input")
            except subprocess.CalledProcessError:
-                # There was an error - command exited with non-zero code
                print("Error: failed filtering, use original microphone input")
        else:
            speaker_wav = speaker_wav
 
        if len(prompt) < 2:
            gr.Warning("Please give a longer prompt text")
-            return (
-                None,
-                None,
-                None,
-                None,
-            )
-        if len(prompt) > 200:
+            return (None, None, None, None)
+
+        # Changed from 200 to 5000 characters
+        if len(prompt) > 5000:
            gr.Warning(
-                "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
-            )
-            return (
-                None,
-                None,
-                None,
-                None,
+                "Text length limited to 5000 characters for this demo"
            )
+            return (None, None, None, None)
+
        global DEVICE_ASSERT_DETECTED
        if DEVICE_ASSERT_DETECTED:
            global DEVICE_ASSERT_PROMPT
            global DEVICE_ASSERT_LANG
-            # It will likely never come here as we restart space on first unrecoverable error now
-            print(
-                f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
-            )
-
-            # HF Space specific.. This error is unrecoverable need to restart space
+            print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
            space = api.get_space_runtime(repo_id=repo_id)
            if space.stage!="BUILDING":
                api.restart_space(repo_id=repo_id)
@@ -227,33 +166,21 @@ def predict(
            metrics_text = ""
            t_latent = time.time()
 
-            # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
            try:
-                (
-                    gpt_cond_latent,
-                    speaker_embedding,
-                ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
+                (gpt_cond_latent, speaker_embedding) = model.get_conditioning_latents(
+                    audio_path=speaker_wav,
+                    gpt_cond_len=30,
+                    gpt_cond_chunk_len=4,
+                    max_ref_length=60
+                )
            except Exception as e:
                print("Speaker encoding error", str(e))
-                gr.Warning(
-                    "It appears something wrong with reference, did you unmute your microphone?"
-                )
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
+                return (None, None, None, None)
 
            latent_calculation_time = time.time() - t_latent
-            # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
+            prompt = re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
 
-            # temporary comma fix
-            prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
-
-            wav_chunks = []
-            ## Direct mode
-
            print("I: Generating new audio...")
            t0 = time.time()
            out = model.inference(
@@ -272,51 +199,9 @@ def predict(
            metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
            torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
 
-
-            """
-            print("I: Generating new audio in streaming mode...")
-            t0 = time.time()
-            chunks = model.inference_stream(
-                prompt,
-                language,
-                gpt_cond_latent,
-                speaker_embedding,
-                repetition_penalty=7.0,
-                temperature=0.85,
-            )
-
-            first_chunk = True
-            for i, chunk in enumerate(chunks):
-                if first_chunk:
-                    first_chunk_time = time.time() - t0
-                    metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                    first_chunk = False
-                wav_chunks.append(chunk)
-                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-            inference_time = time.time() - t0
-            print(
-                f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
-            )
-            #metrics_text += (
-            #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
-            #)
-
-            wav = torch.cat(wav_chunks, dim=0)
-            print(wav.shape)
-            real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
-            print(f"Real-time factor (RTF): {real_time_factor}")
-            metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
-
-            torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
-            """
-
        except RuntimeError as e:
            if "device-side assert" in str(e):
-                # cannot do anything on cuda device side error, need tor estart
-                print(
-                    f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
-                    flush=True,
-                )
+                print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
                gr.Warning("Unhandled Exception encounter, please retry in a minute")
                print("Cuda device-assert Runtime encountered need restart")
                if not DEVICE_ASSERT_DETECTED:
@@ -324,8 +209,6 @@ def predict(
                    DEVICE_ASSERT_PROMPT = prompt
                    DEVICE_ASSERT_LANG = language
 
-                    # just before restarting save what caused the issue so we can handle it in future
-                    # Uploading Error data only happens for unrecovarable error
                    error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
                    error_data = [
                        error_time,
@@ -355,11 +238,7 @@ def predict(
                        repo_type="dataset",
                    )
 
-                    # speaker_wav
-                    print("Writing error reference audio")
-                    speaker_filename = (
-                        error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
-                    )
+                    speaker_filename = error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
                    error_api = HfApi()
                    error_api.upload_file(
                        path_or_fileobj=speaker_wav,
@@ -368,7 +247,6 @@ def predict(
                        repo_type="dataset",
                    )
 
-                    # HF Space specific.. This error is unrecoverable need to restart space
                    space = api.get_space_runtime(repo_id=repo_id)
                    if space.stage!="BUILDING":
                        api.restart_space(repo_id=repo_id)
@@ -378,310 +256,92 @@ def predict(
            else:
                if "Failed to decode" in str(e):
                    print("Speaker encoding error", str(e))
-                    gr.Warning(
-                        "It appears something wrong with reference, did you unmute your microphone?"
-                    )
+                    gr.Warning("It appears something wrong with reference, did you unmute your microphone?")
                else:
                    print("RuntimeError: non device-side assert error:", str(e))
                    gr.Warning("Something unexpected happened please retry again.")
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                return (None, None, None, None)
        return (
-            gr.make_waveform(
-                audio="output.wav",
-            ),
+            gr.make_waveform(audio="output.wav"),
            "output.wav",
            metrics_text,
            speaker_wav,
        )
    else:
        gr.Warning("Please accept the Terms & Condition!")
-        return (
-            None,
-            None,
-            None,
-            None,
-        )
+        return (None, None, None, None)
 
-
-title = "Coqui🐸 XTTS"
+title = "Coqui🐸 XTTS (5000 Char Limit)"
 
 description = """
-
 <br/>
-
-This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
-
+This demo is running **XTTS v2.0.3** with 5000 character limit. <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech model with voice cloning.
 <br/>
-
-Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
-
+Supported languages: Arabic (ar), Portuguese (pt), Chinese (zh-cn), Czech (cs), Dutch (nl), English (en), French (fr), German (de), Italian (it), Polish (pl), Russian (ru), Spanish (es), Turkish (tr), Japanese (ja), Korean (ko), Hungarian (hu), Hindi (hi)
 <br/>
 """
 
-links = """
-<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
-
-| | |
-| ------------------------------- | --------------------------------------- |
-| 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
-| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
-| 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
-| 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
-
-
-"""
-
-article = """
-<div style='margin:20px auto;'>
-<p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
-<p>We collect data only for error cases for improvement.</p>
-</div>
-"""
-examples = [
-    [
-        "Once when I was six years old I saw a magnificent picture",
-        "en",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
-        "fr",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
-        "de",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Cuando tenía seis años, vi una vez una imagen magnífica",
-        "es",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
-        "pt",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
-        "pl",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
-        "it",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
-        "tr",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
-        "ru",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
-        "nl",
-        "examples/male.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
-        "cs",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "当我还只有六岁的时候, 看到了一副精彩的插画",
-        "zh-cn",
-        "examples/female.wav",
-        None,
-        False,
-        False,
-        False,
-        True,
-    ],
-    [
-        "かつて 六歳のとき、素晴らしい絵を見ました",
-        "ja",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-    [
-        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
-        "ko",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-    [
-        "Egyszer hat éves koromban láttam egy csodálatos képet",
-        "hu",
-        "examples/male.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-]
-
-
-
 with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
-            gr.Markdown(
-                """
-                ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
-                """
-            )
+            gr.Markdown("""
+            ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
+            """)
        with gr.Column():
-            # placeholder to align the image
            pass
 
    with gr.Row():
        with gr.Column():
            gr.Markdown(description)
        with gr.Column():
-            gr.Markdown(links)
+            gr.Markdown("""
+            | | |
+            | ------------------------------- | --------------------------------------- |
+            | 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
+            | 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/) |
+            """)
 
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
-                info="One or two sentences at a time is better. Up to 200 text characters.",
+                info="Up to 5000 text characters.",
                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
+                lines=5,
+                max_lines=10
            )
            language_gr = gr.Dropdown(
                label="Language",
-                info="Select an output language for the synthesised speech",
-                choices=[
-                    "en",
-                    "es",
-                    "fr",
-                    "de",
-                    "it",
-                    "pt",
-                    "pl",
-                    "tr",
-                    "ru",
-                    "nl",
-                    "cs",
-                    "ar",
-                    "zh-cn",
-                    "ja",
-                    "ko",
-                    "hu",
-                    "hi"
-                ],
-                max_choices=1,
+                choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"],
                value="en",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
                type="filepath",
                value="examples/female.wav",
            )
            mic_gr = gr.Audio(
                source="microphone",
                type="filepath",
-                info="Use your microphone to record audio",
                label="Use Microphone for Reference",
            )
            use_mic_gr = gr.Checkbox(
                label="Use Microphone",
                value=False,
-                info="Notice: Microphone input may not work properly under traffic",
            )
            clean_ref_gr = gr.Checkbox(
                label="Cleanup Reference Voice",
                value=False,
-                info="This check can improve output if your microphone or reference voice is noisy",
            )
            auto_det_lang_gr = gr.Checkbox(
                label="Do not use language auto-detect",
                value=False,
-                info="Check to disable language auto-detection",
            )
            tos_gr = gr.Checkbox(
-                label="Agree",
+                label="Agree to CPML terms",
                value=False,
-                info="I agree to the terms of the CPML: https://coqui.ai/cpml",
            )
-
-            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-
+            tts_button = gr.Button("Generate Speech", elem_id="send-btn", visible=True)
 
        with gr.Column():
            video_gr = gr.Video(label="Waveform Visual")
@@ -689,15 +349,11 @@ with gr.Blocks(analytics_enabled=False) as demo:
        out_text_gr = gr.Text(label="Metrics")
        ref_audio_gr = gr.Audio(label="Reference Audio Used")
 
-    with gr.Row():
-        gr.Examples(examples,
-                    label="Examples",
-                    inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
-                    outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
-                    fn=predict,
-                    cache_examples=False,)
-
-    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
+    tts_button.click(
+        predict,
+        [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
+        outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr]
+    )
 
 demo.queue()
 demo.launch(debug=True, show_api=True)
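For readers who want to reproduce the voice-cleanup step outside the Space, here is a minimal standalone sketch of the ffmpeg invocation the code above assembles. The `silenceremove` chain is copied from the diff; the lowpass/highpass cutoffs are elided by the hunk, so the values below are an assumption, and the file paths are illustrative:

```python
import subprocess
import uuid

speaker_wav = "reference.wav"  # illustrative input path

# Assumed cutoffs; the actual lowpass_highpass value is elided by the hunk above
lowpass_highpass = "lowpass=8000,highpass=75,"
# Verbatim from the diff: strip leading/trailing silence by reversing twice
trim_silence = (
    "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
    "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
)

# Unique output name ending in .wav so ffmpeg can infer the output format
out_filename = speaker_wav + str(uuid.uuid4()) + ".wav"

# The Space splits an f-string on spaces; an argument list survives paths
# that contain spaces. The concatenated chain ends in a trailing comma,
# stripped here to keep the filter graph well formed.
audio_filter = (lowpass_highpass + trim_silence).rstrip(",")
subprocess.run(
    ["./ffmpeg", "-y", "-i", speaker_wav, "-af", audio_filter, out_filename],
    check=True,
)
```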
 
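The synthesis path itself reduces to two model calls. A minimal sketch under stated assumptions: the checkpoint paths are illustrative (the Space resolves them via ModelManager), the loading lines mirror the `model.load_checkpoint(` / `model.cuda()` context whose arguments the diff elides, and `model.inference` is left at default sampling settings because the hunk does not show them:

```python
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Illustrative checkpoint layout; adjust to wherever XTTS v2 is downloaded
config = XttsConfig()
config.load_json("xtts_v2/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="xtts_v2/")
model.cuda()

# Reference audio -> conditioning latents (keyword arguments as in the diff)
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path="reference.wav",
    gpt_cond_len=30,
    gpt_cond_chunk_len=4,
    max_ref_length=60,
)

# Text + latents -> waveform; out["wav"] is mono audio at 24 kHz
out = model.inference("Hello there!", "en", gpt_cond_latent, speaker_embedding)
torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
```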