Update app.py
app.py CHANGED
@@ -1,129 +1,145 @@
-import spaces
-import gradio as gr
-import edge_tts
 import asyncio
-import tempfile
 import os
-import re


-
-
-
-

-
-
-
-
voice2 = "en-US-BrianNeural - en-US (Male)"
|
20 |
-
voice2F = "en-US-JennyNeural - en-US (Female)"
|
21 |
-
voice3 = "en-AU-WilliamNeural - en-AU (Male)"
|
22 |
voice3F = "en-HK-YanNeural - en-HK (Female)"
|
23 |
-
voice4 = "en-GB-MaisieNeural - en-GB (Female)" #Child
|
|
|
24 |
if not text.strip():
|
25 |
return None
|
|
|
|
|
|
|
26 |
if text.startswith("1F"):
|
27 |
-
text2 = text[2:]
|
28 |
-
voice_short_name =voice1F.split(" - ")[0]
|
29 |
elif text.startswith("2F"):
|
30 |
-
text2 = text[2:]
|
31 |
-
voice_short_name =voice2F.split(" - ")[0]
|
32 |
elif text.startswith("3F"):
|
33 |
-
text2 = text[2:]
|
34 |
-
voice_short_name =voice3F.split(" - ")[0]
|
35 |
elif text.startswith("1M"):
|
36 |
-
text2 = text[2:]
|
37 |
-
voice_short_name =voice2.split(" - ")[0]
|
38 |
elif text.startswith("2M"):
|
39 |
-
text2 = text[2:]
|
40 |
-
voice_short_name =voice3.split(" - ")[0]
|
41 |
elif text.startswith("1C"):
|
42 |
-
text2 = text[2:]
|
43 |
-
voice_short_name =voice4.split(" - ")[0]
|
44 |
else:
|
45 |
-
|
46 |
-
|
47 |
-
text2=text
|
48 |
rate_str = f"{rate:+d}%"
|
49 |
pitch_str = f"{pitch:+d}Hz"
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
55 |
|
-# Main text-to-speech function
 async def text_to_speech(text, voice, rate, pitch):
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")

-    # Split by two or more newline characters, optionally preceded by carriage returns
-    paragraphs = [p for p in re.split(r'\r?\n\r?\n+', text) if p.strip()]
-
     audio_files = []
-
-
         if audio_path:
             audio_files.append(audio_path)

     if not audio_files:
-        return None, None

-    # Combine audio files if there are multiple paragraphs
     if len(audio_files) == 1:
         return audio_files[0], None
     else:
-        #
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
-
-
-
         return combined_audio_path, None
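The regex-based paragraph split removed above can be checked in isolation. This snippet is illustrative only and is not part of the repository; the sample string is made up:

import re

sample = "First paragraph.\r\n\r\nSecond paragraph.\n\n\nThird paragraph."
paragraphs = [p for p in re.split(r'\r?\n\r?\n+', sample) if p.strip()]
print(paragraphs)
# ['First paragraph.', 'Second paragraph.', 'Third paragraph.']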
-
-# Gradio interface function
-@spaces.GPU
-def tts_interface(text, voice, rate, pitch):
-    audio, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
-    return audio, warning
-
-# Create Gradio application
-import gradio as gr
-
-async def create_demo():
-    voices = await get_voices()
-    default_voice = "en-US-AndrewNeural - en-US (Male)" # 👈 Pick one of the available voices
-    description = """
-    Default = male, other voices 1F:US_Emma, 2F:US_Jenny, 3F:HK_Jan, 1M:US_Guy, 2M:AU_William, 1C: Childvoice
-    Enter your text, select a voice, and adjust the speech rate and pitch.
-    The application will process your text paragraph by paragraph (separated by two blank lines).
-    """
-
-    demo = gr.Interface(
-        fn=tts_interface,
-        inputs=[
-            gr.Textbox(label="Input Text", lines=5, placeholder="Separate paragraphs with two blank lines."),
-            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)
-        ],
-        outputs=[
-            gr.Audio(label="Generated Audio", type="filepath"),
-            gr.Markdown(label="Warning", visible=False)
-        ],
-        title="Voicecloning.be Text-to-Speech (Paragraph by Paragraph)",
-        description=description,
-        article="Process text paragraph by paragraph for smoother output.",
-        analytics_enabled=False,
-        allow_flagging=False
-    )
-    return demo
-
-# Run the application
-if __name__ == "__main__":
-    demo = asyncio.run(create_demo())
-    demo.launch()
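The removed create_demo() above calls get_voices(), which is not defined anywhere in this hunk. One plausible implementation on top of edge_tts.list_voices(), shown here purely as an assumption about what that helper might look like, would map display labels of the form used above ("en-US-AndrewNeural - en-US (Male)") to the bare voice ShortName:

import asyncio
import edge_tts

async def get_voices():
    # Hypothetical helper (not shown in this diff): build display labels
    # and map them to the ShortName that edge-tts expects.
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

if __name__ == "__main__":
    print(list(asyncio.run(get_voices()).keys())[:3])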
 import asyncio
 import os
+import re
+import tempfile
+import edge_tts
+import gradio as gr
+
+# Default voice
+default_voice = "en-US-AndrewNeural - en-US (Male)"

+# Text-to-speech function for a single segment
+async def process_speech_segment(text, voice, rate, pitch):
+    """
+    Processes a single segment of text (either a quote or regular text)
+    and generates speech using edge-tts.
+
+    Args:
+        text (str): The text to be converted to speech.
+        voice (str): The voice to use (can be overridden by prefixes).
+        rate (int): The speech rate.
+        pitch (int): The speech pitch.
+
+    Returns:
+        str: The path to the generated audio file, or None on error.
+    """
+    voice1 = "en-US-AndrewNeural - en-US (Male)"  # good for reading
+    voice1F = "en-US-EmmaNeural - en-US (Female)"
     voice2 = "en-US-BrianNeural - en-US (Male)"
+    voice2F = "en-US-JennyNeural - en-US (Female)"
+    voice3 = "en-AU-WilliamNeural - en-AU (Male)"
     voice3F = "en-HK-YanNeural - en-HK (Female)"
+    voice4 = "en-GB-MaisieNeural - en-GB (Female)"  # Child
+
     if not text.strip():
         return None
+
+    voice_short_name = voice.split(" - ")[0]  # default
+
     if text.startswith("1F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice1F.split(" - ")[0]
     elif text.startswith("2F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice2F.split(" - ")[0]
     elif text.startswith("3F"):
+        text2 = text[2:].strip()
+        voice_short_name = voice3F.split(" - ")[0]
     elif text.startswith("1M"):
+        text2 = text[2:].strip()
+        voice_short_name = voice2.split(" - ")[0]
     elif text.startswith("2M"):
+        text2 = text[2:].strip()
+        voice_short_name = voice3.split(" - ")[0]
     elif text.startswith("1C"):
+        text2 = text[2:].strip()
+        voice_short_name = voice4.split(" - ")[0]
     else:
+        text2 = text
+
     rate_str = f"{rate:+d}%"
     pitch_str = f"{pitch:+d}Hz"
+    try:
+        communicate = edge_tts.Communicate(text2, voice=voice_short_name, rate=rate_str, pitch=pitch_str)  # removed async
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+            tmp_path = tmp_file.name
+            await communicate.save(tmp_path)  # added await
+        return tmp_path
+    except Exception as e:
+        print(f"Error processing segment: {e}")  # Log the error
+        return None

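A quick way to exercise process_speech_segment on its own is a small driver script. This is a minimal sketch, not part of the commit; it assumes the function and default_voice above are importable from app.py and that the Edge TTS service is reachable, and the input string is invented:

import asyncio

from app import process_speech_segment, default_voice

async def _demo():
    # "1F" at the start of the segment switches to the en-US-EmmaNeural voice.
    path = await process_speech_segment('1F Hello from the Emma voice.', default_voice, 0, 0)
    print(path)  # e.g. /tmp/tmpXXXXXX.mp3, or None if generation failed

if __name__ == "__main__":
    asyncio.run(_demo())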
+# Main text-to-speech function
 async def text_to_speech(text, voice, rate, pitch):
+    """
+    Processes the input text, identifying quoted sections for different voices,
+    and generates combined audio.
+
+    Args:
+        text (str): The input text.
+        voice (str): The default voice.
+        rate (int): The speech rate.
+        pitch (int): The speech pitch.
+
+    Returns:
+        tuple: (audio_path, error_message) where audio_path is the path to the
+        combined audio file, and error_message is any error encountered.
+    """
     if not text.strip():
         return None, gr.Warning("Please enter text to convert.")
     if not voice:
         return None, gr.Warning("Please select a voice.")

     audio_files = []
+    segments = []
+    i = 0
+    while i < len(text):
+        if text[i] == '"':
+            # Find the closing quote
+            j = i + 1
+            while j < len(text) and text[j] != '"':
+                j += 1
+            if j < len(text):
+                segments.append(("quote", text[i + 1:j]))
+                i = j + 1
+            else:
+                segments.append(("text", text[i:]))  # Handle unclosed quote
+                i = j
+        else:
+            # Find the end of the non-quote text
+            j = i + 1
+            while j < len(text) and text[j] != '"':
+                j += 1
+            segments.append(("text", text[i:j]))
+            i = j
+
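On a sample sentence (made up for illustration), the scanner above alternates between narration and quoted speech; any speaker prefix inside the quote is only stripped later, in process_speech_segment:

sample = 'She said "1F Hi there" and waved.'
# The while loop above would yield:
#   [("text", 'She said '), ("quote", '1F Hi there'), ("text", ' and waved.')]
# The "1F" prefix makes the quoted segment use the en-US-EmmaNeural voice,
# while the surrounding narration keeps the default voice passed in by the caller.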
+    for segment_type, segment_text in segments:
+        if segment_type == "quote":
+            # Determine the voice based on the prefix within the quote.
+            voice_prefix = ""
+            if segment_text.startswith("1F") or segment_text.startswith("2F") or segment_text.startswith("3F") or segment_text.startswith("1M") or segment_text.startswith("2M") or segment_text.startswith("1C"):
+                voice_prefix = segment_text[:2]
+            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
+        else:
+            audio_path = await process_speech_segment(segment_text, voice, rate, pitch)
         if audio_path:
             audio_files.append(audio_path)

     if not audio_files:
+        return None, None

     if len(audio_files) == 1:
         return audio_files[0], None
     else:
+        # Combine audio files
         combined_audio_path = tempfile.mktemp(suffix=".mp3")
         with open(combined_audio_path, 'wb') as outfile:
             for filename in audio_files:
+                try:
+                    with open(filename, 'rb') as infile:
+                        outfile.write(infile.read())
+                    os.remove(filename)  # Clean up individual files
+                except Exception as e:
+                    print(f"Error combining audio files: {e}")
+                    return None, gr.Error(f"Error combining audio files: {e}")
         return combined_audio_path, None
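The combination step above concatenates raw MP3 bytes, which most players tolerate but which does not produce strictly valid MP3 framing. If that ever becomes a problem, re-encoding is one alternative; this sketch is not what the commit does, combine_mp3 is a hypothetical helper name, and it assumes pydub and ffmpeg are installed:

from pydub import AudioSegment

def combine_mp3(paths, out_path):
    # Decode each clip and re-export a single well-formed MP3.
    combined = AudioSegment.empty()
    for p in paths:
        combined += AudioSegment.from_mp3(p)
    combined.export(out_path, format="mp3")
    return out_path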
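This hunk ends without any UI wiring, and the gr.Interface block that used to provide it is removed above. A minimal way to drive the new text_to_speech coroutine from Gradio, sketched from the removed code rather than taken from the Space's actual file, would be to append something like this to the bottom of app.py (the tts_interface name is reused from the removed code):

def tts_interface(text, voice, rate, pitch):
    # Thin synchronous wrapper so Gradio can call the async pipeline above.
    return asyncio.run(text_to_speech(text, voice, rate, pitch))

demo = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Input Text", lines=5),
        gr.Dropdown(choices=[default_voice], value=default_voice, label="Select Voice"),
        gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Speech Rate Adjustment (%)"),
        gr.Slider(minimum=-20, maximum=20, value=0, step=1, label="Pitch Adjustment (Hz)"),
    ],
    outputs=[
        gr.Audio(label="Generated Audio", type="filepath"),
        gr.Markdown(label="Warning", visible=False),
    ],
    title="Text-to-Speech",
)

if __name__ == "__main__":
    demo.launch()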