# -*- coding: utf-8 -*-
"""TTS demo with adjustable voice parameters.

Exported from a Colab notebook:
https://colab.research.google.com/drive/1qFHOV3WJLOgPWOAj4v864Wy3ZKZsut35

Dependencies belong in requirements.txt, not in-script pip installs:
torch, transformers==4.30.2, gradio, TTS, numpy, soundfile, matplotlib.
"""

import torch  # noqa: F401 -- kept so TTS can detect GPU support; confirm it is actually needed
from TTS.api import TTS
import gradio as gr
import soundfile as sf
import matplotlib.pyplot as plt
import numpy as np

# Load one specific pre-trained Text-to-Speech model for the whole app.
# (Interactively, TTS().list_models() enumerates the other available models.)
model_name = "tts_models/en/ljspeech/tacotron2-DDC"
tts = TTS(model_name)
import tempfile

# Sample rate used when writing and plotting audio.
# NOTE(review): the original code hard-coded 24000 in two places; LJSpeech
# models typically output 22050 Hz -- confirm against
# tts.synthesizer.output_sample_rate before relying on this value.
SAMPLE_RATE = 24000


def plot_waveform(wav, sample_rate=SAMPLE_RATE, plot_path=None, color=None):
    """Plot *wav* amplitude against time and save the figure as a PNG.

    Args:
        wav: 1-D sequence of audio samples.
        sample_rate: samples per second; scales the time axis.
        plot_path: destination PNG path. When omitted a temporary file is
            created (the original hard-coded the Colab-only path
            "/content/waveform.png", which breaks outside Colab).
        color: optional matplotlib line color.

    Returns:
        Path of the saved PNG image.
    """
    if plot_path is None:
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
    plt.figure(figsize=(10, 2))
    plt.plot(np.linspace(0, len(wav) / sample_rate, num=len(wav)), wav, color=color)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.tight_layout()
    plt.savefig(plot_path)
    plt.close()  # free the figure's memory; figures accumulate otherwise
    return plot_path


def synthesize_speech_gr(text, speed=1.0, pitch=1.0, volume=1.0):
    """Gradio handler: synthesize *text* and return (audio_path, image_path).

    Args:
        text: text to synthesize.
        speed, pitch: accepted for UI compatibility; tacotron2-DDC exposes
            no direct speed/pitch control, so they are currently ignored
            (TODO: implement via post-processing, e.g. resampling).
        volume: linear gain applied to the samples. Bug fix: the original
            accepted this slider but never used it.

    Returns:
        Tuple of (wav file path, waveform PNG path) on success, or
        (None, None) on any synthesis/IO failure.
    """
    try:
        wav = np.asarray(tts.tts(text), dtype=np.float32)
        wav = wav * volume  # apply the previously-ignored volume slider
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_audio:
            output_path = tmp_audio.name
        sf.write(output_path, wav, SAMPLE_RATE)
        # Reuse the shared helper instead of duplicating its plotting logic.
        waveform_path = plot_waveform(wav, SAMPLE_RATE, color="orange")
        return output_path, waveform_path
    except Exception as e:  # broad on purpose: report any failure to the UI
        print(f"❌ Gradio Error: {e}")
        return None, None
def tts_gradio(text):
    """Minimal TTS handler: synthesize *text* into "output.wav".

    NOTE(review): redundant with synthesize_speech_gr and not wired to the
    Gradio interface below; kept only for backward compatibility -- consider
    deleting it. It writes at 22050 Hz while the rest of this file uses
    24000 Hz -- verify which rate matches the model's actual output.

    Returns:
        "output.wav" on success, None on failure.
    """
    try:
        wav = tts.tts(text)
        sf.write("output.wav", wav, 22050)
        return "output.wav"
    except Exception as e:
        print("❌ Error in TTS generation:", e)
        return None


# Gradio web UI: text plus speed/pitch/volume sliders in; synthesized audio
# and a waveform image out.
iface = gr.Interface(
    fn=synthesize_speech_gr,
    inputs=[
        gr.Textbox(lines=3, label="Enter Text"),
        gr.Slider(0.5, 2.0, value=1.0, label="Speed"),
        gr.Slider(0.5, 2.0, value=1.0, label="Pitch"),
        gr.Slider(0.0, 2.0, value=1.0, label="Volume"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Generated Speech"),
        gr.Image(label="Waveform"),
    ],
    title="🗣️ Text-to-Speech (TTS) Bot with Adjustable Voice Parameters",
    description="Enter text and Hear the generated voice 😁✌️ | ⭐ [GitHub Repo](https://github.com/hrnrxb/TTS-with-Adjustable-Voice-Parameters) | 🌐 [My Website](https://hrnrxb.github.io)",
)

if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. from tests or another
    # entry point) does not start a server as a side effect.
    iface.launch()