Edge_TTS_NGHIA_transcript

Running

App Files Files Community

cnph001 committed on May 12

Commit

218e261

verified ·

1 Parent(s): 0d2cfad

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -7

app.py CHANGED Viewed

@@ -54,7 +54,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    print(f"Processing this  text segment: {processed_text}") # Debug
     voice_map = {
         "1F": "en-GB-SoniaNeural",
         "2M": "en-GB-RyanNeural",
@@ -102,12 +102,23 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
-            if overall_target_duration_ms is not None and os.path.exists(audio_path) and overall_target_duration_ms > 0:
                 audio = AudioSegment.from_mp3(audio_path)
-                audio_duration_ms = len(audio)
-                # We don't do the stretching here anymore for individual segments
-            return audio_path
         except Exception as e:
             print(f"Edge TTS error processing '{processed_text}': {e}")
             return None
@@ -124,7 +135,6 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
             int(start_s) * 1000 +
             int(start_ms)
         )
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
         process_next = False
@@ -140,7 +150,28 @@ async def process_transcript_line(line, next_line_start_time, default_voice, rat
                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
-        return start_time_ms, audio_segments, overall_duration_ms
     return None, None, None
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):

     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
+    print(f"Processing this  text segment: '{processed_text}'") # Debug
     voice_map = {
         "1F": "en-GB-SoniaNeural",
         "2M": "en-GB-RyanNeural",
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
                 audio_path = tmp_file.name
                 await communicate.save(audio_path)
+            if os.path.exists(audio_path):
                 audio = AudioSegment.from_mp3(audio_path)
+                # Trim leading and trailing silence
+                def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
+                    trim_ms = 0
+                    assert chunk_size > 0 # to avoid infinite loop
+                    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
+                        trim_ms += chunk_size
+                    return trim_ms
+                start_trim = detect_leading_silence(audio)
+                end_trim = detect_leading_silence(audio.reverse())
+                trimmed_audio = audio[start_trim:len(audio)-end_trim]
+                trimmed_audio.export(audio_path, format="mp3") # Overwrite with trimmed version
+                return audio_path
         except Exception as e:
             print(f"Edge TTS error processing '{processed_text}': {e}")
             return None
             int(start_s) * 1000 +
             int(start_ms)
         )
         audio_segments = []
         split_parts = re.split(r'[“”"]', text_parts)
         process_next = False
                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, overall_duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
+        if audio_segments:
+            combined_audio = AudioSegment.empty()
+            for segment_path in audio_segments:
+                try:
+                    segment = AudioSegment.from_mp3(segment_path)
+                    combined_audio += segment
+                    os.remove(segment_path) # Clean up individual segment files
+                except Exception as e:
+                    print(f"Error loading or combining audio segment {segment_path}: {e}")
+                    return None, None, None
+            combined_audio_path = f"combined_audio_{start_time_ms}.mp3"
+            try:
+                combined_audio.export(combined_audio_path, format="mp3")
+                return start_time_ms, [combined_audio_path], overall_duration_ms
+            except Exception as e:
+                print(f"Error exporting combined audio: {e}")
+                return None, None, None
+        return start_time_ms, [], overall_duration_ms # Return empty list if no audio generated
     return None, None, None
 async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):