"""Echo: record English speech, transcribe it, translate it into six
languages, and play back a synthesized voice for each translation."""

import os
import tempfile
import uuid  # noqa: F401 -- kept: part of the original import surface
from pathlib import Path

import assemblyai as aai
import gradio as gr
import numpy as np  # noqa: F401 -- kept: part of the original import surface
from gtts import gTTS
from translate import Translator

# Target language codes, in the exact order the Gradio output components
# are wired below (Russian, Turkish, Swedish, German, Spanish, Japanese).
LANGUAGES = ["ru", "tr", "sv", "de", "es", "ja"]


def voice_to_voice(audio_file):
    """Transcribe an English recording, translate it, and synthesize speech.

    Parameters
    ----------
    audio_file : str
        Filesystem path to the recorded audio (Gradio ``type="filepath"``).

    Returns
    -------
    tuple
        Six ``Path`` objects (generated MP3 files) followed by the six
        translated strings, both in ``LANGUAGES`` order.

    Raises
    ------
    gr.Error
        If the transcription service reports an error.
    """
    transcript = transcribe_audio(audio_file)
    if transcript.status == aai.TranscriptStatus.error:
        raise gr.Error(transcript.error)

    list_translations = translate_text(transcript.text)

    # BUGFIX: synthesize each translation in its *own* language. The
    # previous code passed lang='en' for every translation, so an English
    # voice read foreign-language text.
    generated_audio_paths = [
        Path(text_to_speech(translation, lang))
        for translation, lang in zip(list_translations, LANGUAGES)
    ]
    return (*generated_audio_paths, *list_translations)


def transcribe_audio(audio_file):
    """Transcribe ``audio_file`` with AssemblyAI; return the transcript object."""
    # SECURITY: the API key is read from the environment instead of being
    # committed to source control. Set ASSEMBLYAI_API_KEY before launching.
    aai.settings.api_key = os.environ.get("ASSEMBLYAI_API_KEY", "")
    transcriber = aai.Transcriber()
    return transcriber.transcribe(audio_file)


def translate_text(text: str) -> list[str]:
    """Translate English ``text`` into every language in ``LANGUAGES``.

    Returns the translations in the same order as ``LANGUAGES``.
    """
    list_translations = []
    for lang in LANGUAGES:
        translator = Translator(from_lang="en", to_lang=lang)
        list_translations.append(translator.translate(text))
    return list_translations


def text_to_speech(text: str, lang: str = "en") -> str:
    """Synthesize ``text`` with gTTS and return the path of the saved MP3.

    Parameters
    ----------
    text : str
        Text to speak.
    lang : str, optional
        gTTS language/voice code. Defaults to ``"en"`` so existing callers
        keep their previous behavior.
    """
    tts = gTTS(text=text, lang=lang, slow=True)
    # delete=False so the file outlives the context manager: Gradio reads
    # it after this function returns. Temp files accumulate until OS
    # cleanup -- acceptable for a demo app.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tts.save(tmp_file.name)
        return tmp_file.name


with gr.Blocks() as demo:
    gr.Markdown("## Echo: Voice Translation App")
    gr.Markdown("## Record yourself in English and immediately receive voice translations.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                show_download_button=True,
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
                btn = gr.ClearButton(audio_input, "Clear")
        with gr.Column():
            with gr.Row():
                with gr.Group() as turkish:
                    tr_output = gr.Audio(label="Turkish", interactive=False)
                    tr_text = gr.Markdown()
                with gr.Group() as swedish:
                    sv_output = gr.Audio(label="Swedish", interactive=False)
                    sv_text = gr.Markdown()
                with gr.Group() as russian:
                    ru_output = gr.Audio(label="Russian", interactive=False)
                    ru_text = gr.Markdown()
            with gr.Row():
                with gr.Group():
                    de_output = gr.Audio(label="German", interactive=False)
                    de_text = gr.Markdown()
                with gr.Group():
                    es_output = gr.Audio(label="Spanish", interactive=False)
                    es_text = gr.Markdown()
                with gr.Group():
                    jp_output = gr.Audio(label="Japanese", interactive=False)
                    jp_text = gr.Markdown()

    # Must mirror the return order of voice_to_voice: six audio paths in
    # LANGUAGES order, then six translated strings in the same order.
    output_components = [
        ru_output, tr_output, sv_output, de_output, es_output, jp_output,
        ru_text, tr_text, sv_text, de_text, es_text, jp_text,
    ]
    submit.click(
        fn=voice_to_voice,
        inputs=audio_input,
        outputs=output_components,
        show_progress=True,
    )

if __name__ == "__main__":
    demo.launch()