Update app.py
app.py CHANGED
@@ -11,32 +11,35 @@ import librosa
 import soundfile as sf
 import numpy as np
 
+# Global constant for voice mapping
+VOICE_MAP = {
+    "1F": "en-GB-SoniaNeural",
+    "2M": "en-GB-RyanNeural",
+    "3M": "en-US-BrianMultilingualNeural",
+    "2F": "en-US-JennyNeural",
+    "1M": "en-AU-WilliamNeural",
+    "3F": "en-HK-YanNeural",
+    "4M": "en-GB-ThomasNeural",
+    "4F": "en-US-EmmaNeural",
+    "1O": "en-GB-RyanNeural",  # Old Man
+    "1C": "en-GB-MaisieNeural",  # Child
+    "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
+    "2V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
+    "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
+    "4V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
+}
+
 def get_silence(duration_ms=1000):
-
+    """Creates a silent AudioSegment."""
+    # AudioSegment.silent() accepts only duration and frame_rate.
     silent_audio = AudioSegment.silent(
         duration=duration_ms,
         frame_rate=24000
     )
-    # Set audio parameters
-    silent_audio = silent_audio.set_channels(1)  # Mono
-    silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        # Export with specific bitrate and codec parameters
-        silent_audio.export(
-            tmp_file.name,
-            format="mp3",
-            bitrate="48k",
-            parameters=[
-                "-ac", "1",  # Mono
-                "-ar", "24000",  # Sample rate
-                "-sample_fmt", "s32",  # 32-bit samples
-                "-codec:a", "libmp3lame"  # MP3 codec
-            ]
-        )
-    return tmp_file.name
+    return silent_audio.set_channels(1).set_sample_width(4)  # mono, 32-bit
 
-# Get all available voices
 async def get_voices():
+    """Lists available Edge TTS voices."""
     try:
         voices = await edge_tts.list_voices()
         return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
@@ -46,30 +49,12 @@ async def get_voices():
 
 async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
-
-    current_voice_short =
+    processed_text = text_segment.strip()
+    current_voice_short = default_voice.split(" - ")[0] if default_voice else ""
     current_rate = rate
     current_pitch = pitch
-
-
-    voice_map = {
-        "1F": "en-GB-SoniaNeural",
-        "2M": "en-GB-RyanNeural",
-        "3M": "en-US-BrianMultilingualNeural",
-        "2F": "en-US-JennyNeural",
-        "1M": "en-AU-WilliamNeural",
-        "3F": "en-HK-YanNeural",
-        "4M": "en-GB-ThomasNeural",
-        "4F": "en-US-EmmaNeural",
-        "1O": "en-GB-RyanNeural",  # Old Man
-        "1C": "en-GB-MaisieNeural",  # Child
-        "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
-        "2V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
-        "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
-        "4V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
-    }
-    detect = 0
-    for prefix, voice_short in voice_map.items():
+
+    for prefix, voice_short in VOICE_MAP.items():
         if processed_text.startswith(prefix):
             current_voice_short = voice_short
             if prefix in ["1F", "3F", "1V", "3V"]:
@@ -77,20 +62,17 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
             elif prefix in ["1O", "4V"]:
                 current_pitch = -20
                 current_rate = -10
-                detect = 1
             processed_text = processed_text[len(prefix):].strip()
             break
+
     match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
-    if match:
-
-
-
-
-
-
-        processed_text = processed_text.lstrip('-0123456789').strip()  # Remove potential leftover numbers
-    elif detect:
-        processed_text = processed_text[2:].strip()
+    if match and match.group(1) in VOICE_MAP:
+        pitch_adjustment = int(match.group(2))
+        current_pitch += pitch_adjustment
+        processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
+    elif any(processed_text.startswith(prefix) for prefix in VOICE_MAP):  # Handle leftover prefixes
+        processed_text = re.sub(r'^[A-Za-z]{1,2}', '', processed_text).lstrip('-').strip()
+
     if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
@@ -99,110 +81,109 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
             audio_path = tmp_file.name
             await communicate.save(audio_path)
-
-
-
-
-
-
-
-
-
-
-
-                y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
-                sf.write(audio_path, y_stretched, sr)
-            else:
-                print("Generated audio is not longer than target duration, no speed adjustment.")  # Debug
-            return audio_path
+
+        if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
+            audio = AudioSegment.from_mp3(audio_path)
+            audio_duration_ms = len(audio)
+            if audio_duration_ms > target_duration_ms:
+                speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                if speed_factor > 0 and speed_factor >= 1.0:
+                    y, sr = librosa.load(audio_path, sr=None)
+                    y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
+                    sf.write(audio_path, y_stretched, sr)
+        return audio_path
     except Exception as e:
         print(f"Edge TTS error processing '{processed_text}': {e}")
         return None
     return None
 
 async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
-    """Processes a single transcript line with
-    match = re.match(r'(\d{2}
+    """Processes a single transcript line with timestamp and potential voice changes."""
+    match = re.match(r'(\d{2}:\d{2}:\d{2},\d{3})\s+-\s+(\d{2}:\d{2}:\d{2},\d{3})\s+(.*)', line)
     if match:
-
-
-
-
-
-            int(
-
-
-
-            int(end_m) * 60000 +
-            int(end_s) * 1000 +
-            int(end_ms)
-        )
+        start_time_str, end_time_str, text_parts = match.groups()
+
+        def time_str_to_ms(time_str):
+            h, m, s_ms = time_str.split(':')
+            s, ms = s_ms.split(',')
+            return int(h) * 3600000 + int(m) * 60000 + int(s) * 1000 + int(ms)
+
+        start_time_ms = time_str_to_ms(start_time_str)
+        end_time_ms = time_str_to_ms(end_time_str)
         duration_ms = end_time_ms - start_time_ms
+
         audio_segments = []
-
-
-        for part in
+        parts = re.split(r'([“”"])', text_parts)
+        in_quote = False
+        for part in parts:
             if part == '"':
-
+                in_quote = not in_quote
                 continue
-            if
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
-                if audio_path:
-                    audio_segments.append(audio_path)
-            elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
+            if part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor if in_quote else 1.0)
                 if audio_path:
                     audio_segments.append(audio_path)
         return start_time_ms, audio_segments, duration_ms
     return None, None, None
 
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
+    """Converts a timestamped transcript with voice changes to a single audio file."""
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
+
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        for line in lines:
+            start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
+            if start_time is not None and audio_paths:
+                combined_line_audio = AudioSegment.empty()
+                for path in audio_paths:
+                    if path and os.path.exists(path):
+                        try:
+                            audio = AudioSegment.from_mp3(path)
+                            combined_line_audio += audio
+                        except FileNotFoundError:
+                            print(f"Warning: Audio file not found: {path}")
+                        finally:
+                            try:
+                                os.remove(path)
+                            except OSError:
+                                print(f"Warning: Could not remove temporary file: {path}")
+                if combined_line_audio:
+                    timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
+                    max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
+            elif audio_paths:
+                for path in audio_paths:
+                    if path:
+                        try:
+                            os.remove(path)
+                        except FileNotFoundError:
+                            pass  # Clean up even if no timestamp
+
+        if not timed_audio_segments:
+            return None, "No processable audio segments found."
+
+        final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
+        for segment in timed_audio_segments:
+            final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
+
+        combined_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name  # must outlive the tmpdir cleanup
+        final_audio.export(combined_audio_path, format="mp3")
+        return combined_audio_path, None
 
 @spaces.GPU
 def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
+    """Gradio interface function for TTS."""
     audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
     return audio, warning
 
 async def create_demo():
+    """Creates the Gradio demo interface."""
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """