update_gtts for Amharic
soni_translate/text_to_speech.py  CHANGED  (+52 -11)
@@ -963,6 +963,45 @@ def filter_by_speaker(speakers, segments):
     }
 
 
+def segments_gtts_tts(filtered_gtts_segments, TRANSLATE_AUDIO_TO):
+    """Google TTS implementation"""
+    for segment in tqdm(filtered_gtts_segments["segments"]):
+        speaker = segment["speaker"]
+        text = segment["text"]
+        start = segment["start"]
+        tts_name = segment["tts_name"]
+
+        # make the tts audio
+        filename = f"audio/{start}.ogg"
+        logger.info(f"{text} >> {filename}")
+        try:
+            # Get language code from TTS name (e.g. "am-GTTS" -> "am")
+            lang_code = tts_name.split('-')[0]
+            tts = gTTS(text=text, lang=lang_code)
+
+            # Use temporary file to avoid file system issues
+            from tempfile import TemporaryFile
+            f = TemporaryFile()
+            tts.write_to_fp(f)
+            f.seek(0)
+
+            # Read and save as ogg
+            audio_data, samplerate = sf.read(f)
+            f.close()
+
+            data_tts = pad_array(audio_data, samplerate)
+            sf.write(
+                file=filename,
+                samplerate=samplerate,
+                data=data_tts,
+                format="ogg",
+                subtype="vorbis"
+            )
+            verify_saved_file_and_size(filename)
+        except Exception as error:
+            error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename)
+
+
 def audio_segmentation_to_voice(
     result_diarize,
     TRANSLATE_AUDIO_TO,
@@ -1021,6 +1060,7 @@ def audio_segmentation_to_voice(
     pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$")
     pattern_vits_onnx = re.compile(r".* VITS-onnx$")
     pattern_openai_tts = re.compile(r".* OpenAI-TTS$")
+    pattern_gtts = re.compile(r".*-GTTS$")  # New pattern for gTTS
 
     all_segments = result_diarize["segments"]
 
@@ -1028,12 +1068,9 @@
     speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments)
     speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments)
     speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments)
-    speakers_vits_onnx = find_spkr(
-        pattern_vits_onnx, speaker_to_voice, all_segments
-    )
-    speakers_openai_tts = find_spkr(
-        pattern_openai_tts, speaker_to_voice, all_segments
-    )
+    speakers_vits_onnx = find_spkr(pattern_vits_onnx, speaker_to_voice, all_segments)
+    speakers_openai_tts = find_spkr(pattern_openai_tts, speaker_to_voice, all_segments)
+    speakers_gtts = find_spkr(pattern_gtts, speaker_to_voice, all_segments)  # New gTTS speakers
 
     # Filter method in segments
     filtered_edge = filter_by_speaker(speakers_edge, all_segments)
@@ -1042,6 +1079,7 @@
     filtered_coqui = filter_by_speaker(speakers_coqui, all_segments)
     filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments)
     filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments)
+    filtered_gtts = filter_by_speaker(speakers_gtts, all_segments)  # New gTTS filter
 
     # Infer
     if filtered_edge["segments"]:
@@ -1049,9 +1087,7 @@
         segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui)  # mp3
     if filtered_bark["segments"]:
         logger.info(f"BARK TTS: {speakers_bark}")
-        segments_bark_tts(
-            filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark
-        )  # wav
+        segments_bark_tts(filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark)  # wav
     if filtered_vits["segments"]:
         logger.info(f"VITS TTS: {speakers_vits}")
         segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO)  # wav
@@ -1071,6 +1107,9 @@
     if filtered_openai_tts["segments"]:
         logger.info(f"OpenAI TTS: {speakers_openai_tts}")
         segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO)  # wav
+    if filtered_gtts["segments"]:  # New gTTS condition
+        logger.info(f"Google TTS: {speakers_gtts}")
+        segments_gtts_tts(filtered_gtts, TRANSLATE_AUDIO_TO)  # ogg
 
     [result.pop("tts_name", None) for result in result_diarize["segments"]]
     return [
@@ -1079,7 +1118,8 @@
         speakers_vits,
         speakers_coqui,
         speakers_vits_onnx,
-        speakers_openai_tts
+        speakers_openai_tts,
+        speakers_gtts  # Add gTTS to returned speakers list
     ]
 
 
@@ -1098,7 +1138,8 @@ def accelerate_segments(
         speakers_vits,
         speakers_coqui,
         speakers_vits_onnx,
-        speakers_openai_tts
+        speakers_openai_tts,
+        speakers_gtts
     ) = valid_speakers
 
     create_directories(f"{folder_output}/audio/")
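For reference, the sketch below shows the routing convention this commit relies on, outside the full pipeline: a voice name ending in "-GTTS" (e.g. "am-GTTS" for Amharic) selects Google TTS, and the prefix before "-GTTS" is used as the gTTS language code. It is a minimal, hypothetical example rather than the project's code: the helper name synthesize_if_gtts, the sample segment, and the output path are illustrative, and it saves the MP3 produced by gTTS directly instead of padding and re-encoding to Ogg as segments_gtts_tts does.

import re
from gtts import gTTS

# Voice names ending in "-GTTS" are routed to Google TTS (same pattern as the commit).
PATTERN_GTTS = re.compile(r".*-GTTS$")

def synthesize_if_gtts(segment, out_path):
    """Synthesize one translated segment with gTTS when its tts_name matches the -GTTS pattern."""
    tts_name = segment["tts_name"]
    if not PATTERN_GTTS.match(tts_name):
        return False  # another engine (Edge, Bark, VITS, OpenAI, ...) handles this voice
    lang_code = tts_name.split("-")[0]  # "am-GTTS" -> "am"
    gTTS(text=segment["text"], lang=lang_code).save(out_path)  # gTTS writes MP3
    return True

# Hypothetical segment with the fields the new function reads (speaker/start omitted):
segment = {"text": "ሰላም", "tts_name": "am-GTTS"}  # "ሰላም" = "hello" in Amharic
synthesize_if_gtts(segment, "audio/example.mp3")

The real segments_gtts_tts in the diff additionally pads the audio with pad_array, writes it as Ogg Vorbis via soundfile so the filename matches what the rest of the pipeline expects, and routes failures through error_handling_in_tts.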