RO-Rtechs committed on
Commit
0024e0d
·
verified ·
1 Parent(s): c5f1c66

update_gtts for Amharic

Browse files
Files changed (1) hide show
  1. soni_translate/text_to_speech.py +52 -11
soni_translate/text_to_speech.py CHANGED
@@ -963,6 +963,45 @@ def filter_by_speaker(speakers, segments):
963
  }
964
 
965
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
966
  def audio_segmentation_to_voice(
967
  result_diarize,
968
  TRANSLATE_AUDIO_TO,
@@ -1021,6 +1060,7 @@ def audio_segmentation_to_voice(
1021
  pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
1022
  pattern_vits_onnx = re.compile(r".* VITS-onnx$")
1023
  pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
 
1024
 
1025
  all_segments = result_diarize["segments"]
1026
 
@@ -1028,12 +1068,9 @@ def audio_segmentation_to_voice(
1028
  speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
1029
  speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
1030
  speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
1031
- speakers_vits_onnx = find_spkr(
1032
- pattern_vits_onnx, speaker_to_voice, all_segments
1033
- )
1034
- speakers_openai_tts = find_spkr(
1035
- pattern_openai_tts, speaker_to_voice, all_segments
1036
- )
1037
 
1038
  # Filter method in segments
1039
  filtered_edge = filter_by_speaker(speakers_edge, all_segments)
@@ -1042,6 +1079,7 @@ def audio_segmentation_to_voice(
1042
  filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
1043
  filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
1044
  filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
 
1045
 
1046
  # Infer
1047
  if filtered_edge["segments"]:
@@ -1049,9 +1087,7 @@ def audio_segmentation_to_voice(
1049
  segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3
1050
  if filtered_bark["segments"]:
1051
  logger.info(f"BARK TTS: {speakers_bark}")
1052
- segments_bark_tts(
1053
- filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark
1054
- ) # wav
1055
  if filtered_vits["segments"]:
1056
  logger.info(f"VITS TTS: {speakers_vits}")
1057
  segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav
@@ -1071,6 +1107,9 @@ def audio_segmentation_to_voice(
1071
  if filtered_openai_tts["segments"]:
1072
  logger.info(f"OpenAI TTS: {speakers_openai_tts}")
1073
  segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
 
 
 
1074
 
1075
  [result.pop("tts_name", None) for result in result_diarize["segments"]]
1076
  return [
@@ -1079,7 +1118,8 @@ def audio_segmentation_to_voice(
1079
  speakers_vits,
1080
  speakers_coqui,
1081
  speakers_vits_onnx,
1082
- speakers_openai_tts
 
1083
  ]
1084
 
1085
 
@@ -1098,7 +1138,8 @@ def accelerate_segments(
1098
  speakers_vits,
1099
  speakers_coqui,
1100
  speakers_vits_onnx,
1101
- speakers_openai_tts
 
1102
  ) = valid_speakers
1103
 
1104
  create_directories(f"{folder_output}/audio/")
 
963
  }
964
 
965
 
966
def segments_gtts_tts(filtered_gtts_segments, TRANSLATE_AUDIO_TO):
    """Synthesize every segment with Google TTS (gTTS) and save it as OGG.

    For each entry in ``filtered_gtts_segments["segments"]`` the segment
    text is sent to gTTS, the returned audio is decoded with soundfile,
    passed through ``pad_array`` and written to ``audio/{start}.ogg``
    (OGG container, Vorbis subtype).

    Args:
        filtered_gtts_segments: Mapping with a ``"segments"`` list; each
            segment must provide the ``"text"``, ``"start"`` and
            ``"tts_name"`` keys.
        TRANSLATE_AUDIO_TO: Target language identifier; only forwarded to
            ``error_handling_in_tts`` when synthesis of a segment fails.

    Returns:
        None. The result is the set of files written under ``audio/``.

    Any exception raised while synthesizing, decoding or writing a segment
    is caught and delegated to ``error_handling_in_tts`` so that one bad
    segment does not abort the remaining ones.
    """
    # Function-scope import kept local to this block, but hoisted out of the
    # loop so the import statement is executed once per call.
    from tempfile import TemporaryFile

    for segment in tqdm(filtered_gtts_segments["segments"]):
        text = segment["text"]
        start = segment["start"]
        tts_name = segment["tts_name"]

        # make the tts audio
        filename = f"audio/{start}.ogg"
        logger.info(f"{text} >> {filename}")
        try:
            # Get language code from TTS name (e.g. "am-GTTS" -> "am")
            # NOTE(review): a region-qualified name such as "zh-CN-GTTS"
            # would be truncated to "zh" here - confirm which voice names
            # are actually registered for gTTS.
            lang_code = tts_name.split('-')[0]
            tts = gTTS(text=text, lang=lang_code)

            # Buffer the synthesized audio in a temporary file. The context
            # manager guarantees the handle is released even when gTTS or
            # the decoder raises, which the manual close() did not.
            with TemporaryFile() as tmp_audio:
                tts.write_to_fp(tmp_audio)
                tmp_audio.seek(0)
                # NOTE(review): gTTS emits MP3 data; decoding it here
                # presumably requires a libsndfile build with MP3 support.
                audio_data, samplerate = sf.read(tmp_audio)

            # Save as ogg
            data_tts = pad_array(audio_data, samplerate)
            sf.write(
                file=filename,
                samplerate=samplerate,
                data=data_tts,
                format="ogg",
                subtype="vorbis",
            )
            verify_saved_file_and_size(filename)
        except Exception as error:
            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
1003
+
1004
+
1005
  def audio_segmentation_to_voice(
1006
  result_diarize,
1007
  TRANSLATE_AUDIO_TO,
 
1060
  pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
1061
  pattern_vits_onnx = re.compile(r".* VITS-onnx$")
1062
  pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
1063
+ pattern_gtts = re.compile(r".*-GTTS$") # New pattern for gTTS
1064
 
1065
  all_segments = result_diarize["segments"]
1066
 
 
1068
  speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
1069
  speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
1070
  speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
1071
+ speakers_vits_onnx = find_spkr(pattern_vits_onnx, speaker_to_voice, all_segments)
1072
+ speakers_openai_tts = find_spkr(pattern_openai_tts, speaker_to_voice, all_segments)
1073
+ speakers_gtts = find_spkr(pattern_gtts, speaker_to_voice, all_segments) # New gTTS speakers
 
 
 
1074
 
1075
  # Filter method in segments
1076
  filtered_edge = filter_by_speaker(speakers_edge, all_segments)
 
1079
  filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
1080
  filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
1081
  filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
1082
+ filtered_gtts = filter_by_speaker(speakers_gtts, all_segments) # New gTTS filter
1083
 
1084
  # Infer
1085
  if filtered_edge["segments"]:
 
1087
  segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3
1088
  if filtered_bark["segments"]:
1089
  logger.info(f"BARK TTS: {speakers_bark}")
1090
+ segments_bark_tts(filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark) # wav
 
 
1091
  if filtered_vits["segments"]:
1092
  logger.info(f"VITS TTS: {speakers_vits}")
1093
  segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav
 
1107
  if filtered_openai_tts["segments"]:
1108
  logger.info(f"OpenAI TTS: {speakers_openai_tts}")
1109
  segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav
1110
+ if filtered_gtts["segments"]: # New gTTS condition
1111
+ logger.info(f"Google TTS: {speakers_gtts}")
1112
+ segments_gtts_tts(filtered_gtts, TRANSLATE_AUDIO_TO) # ogg
1113
 
1114
  [result.pop("tts_name", None) for result in result_diarize["segments"]]
1115
  return [
 
1118
  speakers_vits,
1119
  speakers_coqui,
1120
  speakers_vits_onnx,
1121
+ speakers_openai_tts,
1122
+ speakers_gtts # Add gTTS to returned speakers list
1123
  ]
1124
 
1125
 
 
1138
  speakers_vits,
1139
  speakers_coqui,
1140
  speakers_vits_onnx,
1141
+ speakers_openai_tts,
1142
+ speakers_gtts
1143
  ) = valid_speakers
1144
 
1145
  create_directories(f"{folder_output}/audio/")