Edge_TTS_NGHIA_transcript

Running

App Files Files Community

cnph001 committed on May 13

Commit

3229678

verified ·

1 Parent(s): a6eabef

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -13

app.py CHANGED Viewed

@@ -202,10 +202,10 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
-    previous_start_time_ms = 0
-    for i, line in enumerate(lines):
-        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             for path in audio_paths:
@@ -216,17 +216,43 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
                 except FileNotFoundError:
                     print(f"Warning: Audio file not found: {path}")
-            if combined_line_audio:
-                current_audio_duration = len(combined_line_audio)
-                intended_start_time = start_time
-                if i > 0:
-                    time_difference = start_time - previous_start_time_ms
-                    if current_audio_duration > time_difference:
-                        intended_start_time = previous_end_time_ms
                 timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
-                previous_start_time_ms = start_time
                 previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
                 max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
@@ -235,6 +261,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
                     os.remove(path)
                 except FileNotFoundError:
                     pass # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."

     timed_audio_segments = []
     max_end_time_ms = 0
     previous_end_time_ms = 0
+    i = 0
+    while i < len(lines):
+        start_time, audio_paths = await process_transcript_line(lines[i], voice, rate, pitch)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             for path in audio_paths:
                 except FileNotFoundError:
                     print(f"Warning: Audio file not found: {path}")
+            current_audio_duration = len(combined_line_audio)
+            intended_start_time = start_time
+            # Check duration until the next timestamp
+            if i + 1 < len(lines):
+                next_start_time = int(lines[i+1].split(',')[0].replace(':', '')) * 1000
+                next_start_time_ms = (next_start_time // 1000000 * 3600000) + ((next_start_time % 1000000) // 10000 * 60000) + ((next_start_time % 1000000) % 10000 // 100) * 1000 + (next_start_time % 1000000) % 10000 % 100
+                duration_to_next = next_start_time_ms - start_time
+                if current_audio_duration > duration_to_next:
+                    # Hold and append audio from subsequent lines
+                    j = i + 1
+                    while j < len(lines):
+                        next_start_time, next_audio_paths = await process_transcript_line(lines[j], voice, rate, pitch)
+                        if next_start_time is not None and next_audio_paths:
+                            for next_path in next_audio_paths:
+                                try:
+                                    next_audio = AudioSegment.from_mp3(next_path)
+                                    combined_line_audio += next_audio
+                                    os.remove(next_path)
+                                except FileNotFoundError:
+                                    print(f"Warning: Audio file not found: {next_path}")
+                            current_audio_duration = len(combined_line_audio)
+                            #check duration to the next timestamp.
+                            if j + 1 < len(lines):
+                                next_start_time_2 = int(lines[j+1].split(',')[0].replace(':', '')) * 1000
+                                next_start_time_ms_2 = (next_start_time_2 // 1000000 * 3600000) + ((next_start_time_2 % 1000000) // 10000 * 60000 ) + ((next_start_time_2 % 1000000) % 10000 // 100) * 1000 + (next_start_time_2 % 1000000) % 10000 % 100
+                                duration_to_next_2 = next_start_time_ms_2 - start_time
+                                if current_audio_duration <= duration_to_next_2:
+                                    break
+                            j += 1
+                        else:
+                            break
+                    i = j #update i to j
                 timed_audio_segments.append({'start': intended_start_time, 'audio': combined_line_audio})
                 previous_end_time_ms = max(previous_end_time_ms, intended_start_time + current_audio_duration)
                 max_end_time_ms = max(max_end_time_ms, previous_end_time_ms)
         elif audio_paths:
                     os.remove(path)
                 except FileNotFoundError:
                     pass # Clean up even if no timestamp
+        i += 1
     if not timed_audio_segments:
         return None, "No processable audio segments found."