Edge_TTS_NGHIA_transcript

Running

App Files Community

cnph001 committed on May 11

Commit

0d2cfad

verified ·

1 Parent(s): e4c6d2d

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -45

app.py CHANGED Viewed

@@ -47,7 +47,7 @@ async def get_voices():
         print(f"Error listing voices: {e}")
         return {}
-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
@@ -102,29 +102,18 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
-            if target_duration_ms is not None and os.path.exists(audio_path) and target_duration_ms > 0:
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
-                #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
-                if audio_duration_ms > target_duration_ms:
-                    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
-                    #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
-                    if speed_factor > 0:
-                        if speed_factor < 1.0:
-                            speed_factor = 1.0
-                        audio = AudioSegment.from_file(audio_path)
-                        audio_stretched = audio.speedup(playback_speed=speed_factor)
-                        audio_stretched.export(audio_path, format="mp3")
-                else:
-                    print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
             return audio_path
         except Exception as e:
             print(f"Edge TTS error processing '{processed_text}': {e}")
             return None
     return None
-async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, speed_adjustment_factor):
     """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
@@ -135,9 +124,6 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
             int(start_s) * 1000 +
             int(start_ms)
         )
-        duration_ms = None
-        if next_line_start_time is not None:
-            duration_ms = next_line_start_time - start_time_ms
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
@@ -147,14 +133,14 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
                 process_next = not process_next
                 continue
             if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
-        return start_time_ms, audio_segments, duration_ms
     return None, None, None
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
@@ -165,6 +151,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
     for i, line in enumerate(lines):
         next_line_start_time = None
         if i < len(lines) - 1:
@@ -178,27 +165,52 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjust
                     int(nms)
                 )
-        start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, speed_adjustment_factor)
-        if start_time is not None and audio_paths:
-            combined_line_audio = AudioSegment.empty()
-            for path in audio_paths:
-                if path:  # Only process if audio_path is not None (meaning TTS was successful)
-                    try:
-                        audio = AudioSegment.from_mp3(path)
-                        combined_line_audio += audio
-                        os.remove(path)
-                    except FileNotFoundError:
-                        print(f"Warning: Audio file not found: {path}")
-            if combined_line_audio:
-                timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
-                max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
-        elif audio_paths:
-            for path in audio_paths:
-                if path:
-                    try:
-                        os.remove(path)
-                    except FileNotFoundError:
-                        pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
@@ -221,8 +233,8 @@ async def create_demo():
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
-    The duration for each segment is determined by the timestamp of the following line.
-    The speed of the generated audio will be adjusted to fit within this duration.
     If there is no subsequent timestamp, the speed adjustment will be skipped.
     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
     Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
@@ -261,7 +273,7 @@ async def create_demo():
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="TTS with Dynamic Duration and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False

         print(f"Error listing voices: {e}")
         return {}
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, overall_target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
+            if overall_target_duration_ms is not None and os.path.exists(audio_path) and overall_target_duration_ms > 0:
                 audio = AudioSegment.from_mp3(audio_path)
                 audio_duration_ms = len(audio)
+                # We don't do the stretching here anymore for individual segments
             return audio_path
         except Exception as e:
             print(f"Edge TTS error processing '{processed_text}': {e}")
             return None
     return None
+async def process_transcript_line(line, next_line_start_time, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor):
     """Processes a single transcript line with HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
             int(start_s) * 1000 +
             int(start_ms)
         )
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
                 process_next = not process_next
                 continue
             if process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
+        return start_time_ms, audio_segments, overall_duration_ms
     return None, None, None
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
     for i, line in enumerate(lines):
         next_line_start_time = None
         if i < len(lines) - 1:
                     int(nms)
                 )
+        current_line_match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
+        if current_line_match:
+            sh, sm, ss, sms, text_content = current_line_match.groups()
+            start_time_ms = (
+                int(sh) * 3600000 +
+                int(sm) * 60000 +
+                int(ss) * 1000 +
+                int(sms)
+            )
+            overall_duration_ms = None
+            if next_line_start_time is not None:
+                overall_duration_ms = next_line_start_time - start_time_ms
+            start_time, audio_paths, duration = await process_transcript_line(line, next_line_start_time, voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
+            if start_time is not None and audio_paths:
+                combined_line_audio = AudioSegment.empty()
+                total_generated_duration_ms = 0
+                for path in audio_paths:
+                    if path:
+                        try:
+                            audio = AudioSegment.from_mp3(path)
+                            combined_line_audio += audio
+                            total_generated_duration_ms += len(audio)
+                            os.remove(path)
+                        except FileNotFoundError:
+                            print(f"Warning: Audio file not found: {path}")
+                if combined_line_audio and overall_duration_ms is not None and overall_duration_ms > 0 and total_generated_duration_ms > overall_duration_ms:
+                    speed_factor = (total_generated_duration_ms / overall_duration_ms) * speed_adjustment_factor
+                    if speed_factor > 0:
+                        if speed_factor < 1.0:
+                            speed_factor = 1.0
+                        combined_line_audio = combined_line_audio.speedup(playback_speed=speed_factor)
+                if combined_line_audio:
+                    timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
+                    max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
+            elif audio_paths:
+                for path in audio_paths:
+                    if path:
+                        try:
+                            os.remove(path)
+                        except FileNotFoundError:
+                            pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
     Process timestamped text (HH:MM:SS,milliseconds) with voice changes within quotes.
+    The duration for each line is determined by the timestamp of the following line.
+    The speed of the ENTIRE generated audio for a line will be adjusted to fit within this duration.
     If there is no subsequent timestamp, the speed adjustment will be skipped.
     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
     Format: `HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
+        title="TTS with Line-Wide Duration Adjustment and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False