Edge_TTS_NGHIA_transcript

Running

App Files Community

cnph001 committed on May 7

Commit

06acc33

verified ·

1 Parent(s): d9e25a8

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -15

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 from pydub import AudioSegment
 import librosa
 import numpy as np
 def get_silence(duration_ms=1000):
@@ -41,7 +42,7 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
@@ -104,10 +105,9 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
             if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
-                speed_factor = (audio_duration_ms / target_duration_ms)
-                print(f"Speed factor (to reduce duration): {speed_factor}") # Debug
                 if speed_factor > 0:
-                    speed_factor = speed_factor * 0.8 # Reduce the speed adjustment
                     if speed_factor <1.0:
                        speed_factor = 1.0
                     y, sr = librosa.load(audio_path, sr=None)
@@ -119,7 +119,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
         return audio_path
     return None
-async def process_transcript_line(line, default_voice, rate, pitch):
     """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
@@ -145,18 +145,18 @@ async def process_transcript_line(line, default_voice, rate, pitch):
                 process_next = not process_next
                 continue
             if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
                 if audio_path:
                     audio_segments.append(audio_path)
         return start_time_ms, audio_segments, duration_ms
     return None, None, None
-async def transcript_to_speech(transcript_text, voice, rate, pitch):
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
@@ -167,7 +167,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     max_end_time_ms = 0
     for line in lines:
-        start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             current_time_ms = start_time
@@ -176,7 +176,6 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
             for path in audio_paths:
                 try:
                     audio = AudioSegment.from_mp3(path)
-                    # No need to adjust speed here, it's done in generate_audio_with_voice_prefix
                     combined_line_audio += audio
                     os.remove(path)
                 except FileNotFoundError:
@@ -204,8 +203,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     return combined_audio_path, None
 @spaces.GPU
-def tts_interface(transcript, voice, rate, pitch):
-    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
     return audio, warning
 async def create_demo():
@@ -214,6 +213,7 @@ async def create_demo():
     description = """
     Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
     The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
     Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
@@ -243,7 +243,8 @@ async def create_demo():
             gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
@@ -257,6 +258,5 @@ async def create_demo():
     return demo
 if __name__ == "__main__":
-    import soundfile as sf # Import soundfile here
     demo = asyncio.run(create_demo())
-    demo.launch()

 from pathlib import Path
 from pydub import AudioSegment
 import librosa
+import soundfile as sf
 import numpy as np
 def get_silence(duration_ms=1000):
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
             print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
             if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
+                speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
                 if speed_factor > 0:
                     if speed_factor <1.0:
                        speed_factor = 1.0
                     y, sr = librosa.load(audio_path, sr=None)
         return audio_path
     return None
+async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
     """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
                 process_next = not process_next
                 continue
             if process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
         return start_time_ms, audio_segments, duration_ms
     return None, None, None
+async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
     max_end_time_ms = 0
     for line in lines:
+        start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             current_time_ms = start_time
             for path in audio_paths:
                 try:
                     audio = AudioSegment.from_mp3(path)
                     combined_line_audio += audio
                     os.remove(path)
                 except FileNotFoundError:
     return combined_audio_path, None
 @spaces.GPU
+def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
+    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
     return audio, warning
 async def create_demo():
     description = """
     Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
     The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
+    You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
     Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
             gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
+            gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
     return demo
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
+    demo.launch()