cnph001 committed on
Commit
27bebc1
·
verified ·
1 Parent(s): ba52a5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -140
app.py CHANGED
@@ -9,114 +9,82 @@ from pathlib import Path
9
  from pydub import AudioSegment
10
 
11
  def get_silence(duration_ms=1000):
12
- # Create silent audio segment with specified parameters
13
- silent_audio = AudioSegment.silent(
14
- duration=duration_ms,
15
- frame_rate=24000 # 24kHz sampling rate
16
- )
17
-
18
- # Set audio parameters
19
- silent_audio = silent_audio.set_channels(1) # Mono
20
- silent_audio = silent_audio.set_sample_width(4) # 32-bit (4 bytes per sample)
21
-
22
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
23
- # Export with specific bitrate and codec parameters
24
- silent_audio.export(
25
- tmp_file.name,
26
- format="mp3",
27
- bitrate="48k",
28
- parameters=[
29
- "-ac", "1", # Mono
30
- "-ar", "24000", # Sample rate
31
- "-sample_fmt", "s32", # 32-bit samples
32
- "-codec:a", "libmp3lame" # MP3 codec
33
- ]
34
- )
35
- return tmp_file.name
36
 
37
  # Get all available voices
38
  async def get_voices():
39
- voices = await edge_tts.list_voices()
40
- return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
41
-
42
- async def process_transcript_line(line, voice, rate, pitch):
43
- """Processes a single transcript line to extract time, voice commands, and generate audio."""
44
- match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line)
45
- if match:
46
- minutes, seconds, milliseconds_str, text_with_commands = match.groups()
47
- start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
48
- if not text_with_commands.strip():
49
- return start_time_ms, None
50
-
51
- current_voice_full = voice # Store the full voice string from the dropdown
52
- current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
53
- current_rate = rate
54
- current_pitch = pitch
55
- processed_text = text_with_commands
56
-
57
- voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
58
- voice1_short = voice1_full.split(" - ")[0]
59
- voice1F_full ="en-GB-SoniaNeural - en-GB (Female)"
60
- voice1F_short = voice1F_full.split(" - ")[0]
61
- voice2_full = "en-GB-RyanNeural - en-GB (Male)"
62
- voice2_short = voice2_full.split(" - ")[0]
63
- voice2F_full = "en-US-JennyNeural - en-US (Female)"
64
- voice2F_short = voice2F_full.split(" - ")[0]
65
- voice3_full ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
66
- voice3_short = voice3_full.split(" - ")[0]
67
- voice3F_full = "en-HK-YanNeural - en-HK (Female)"
68
- voice3F_short = voice3F_full.split(" - ")[0]
69
- voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
70
- voice4_short = voice4_full.split(" - ")[0]
71
- voice4F_full ="en-US-EmmaNeural - en-US (Female)"
72
- voice4F_short = voice4F_full.split(" - ")[0]
73
- voice5_full = "en-GB-RyanNeural - en-GB (Male)" #Old Man
74
- voice5_short = voice5_full.split(" - ")[0]
75
- voice6_full = "en-GB-MaisieNeural - en-GB (Female)" #Child
76
- voice6_short = voice6_full.split(" - ")[0]
77
-
78
- if text_with_commands.startswith("1F"):
79
- current_voice_short = voice1F_short
80
- current_pitch = 25
81
- processed_text = text_with_commands[2:].strip()
82
- elif text_with_commands.startswith("2F"):
83
- current_voice_short = voice2F_short
84
- processed_text = text_with_commands[2:].strip()
85
- elif text_with_commands.startswith("3F"):
86
- current_voice_short = voice3F_short
87
- processed_text = text_with_commands[2:].strip()
88
- elif text_with_commands.startswith("4F"):
89
- current_voice_short = voice4F_short
90
- processed_text = text_with_commands[2:].strip()
91
- elif text_with_commands.startswith("1M"):
92
- current_voice_short = voice1_short
93
- processed_text = text_with_commands[2:].strip()
94
- elif text_with_commands.startswith("2M"):
95
- current_voice_short = voice2_short
96
- processed_text = text_with_commands[2:].strip()
97
- elif text_with_commands.startswith("3M"):
98
- current_voice_short = voice3_short
99
- processed_text = text_with_commands[2:].strip()
100
- elif text_with_commands.startswith("4M"):
101
- current_voice_short = voice4_short
102
- processed_text = text_with_commands[2:].strip()
103
- elif text_with_commands.startswith("1O"): # Old man voice
104
- current_voice_short = voice5_short
105
- current_pitch = -20
106
- current_rate = -10
107
- processed_text = text_with_commands[2:].strip()
108
- elif text_with_commands.startswith("1C"): #Child voice
109
- current_voice_short = voice6_short
110
- processed_text = text_with_commands[2:].strip()
111
-
112
- rate_str = f"{current_rate:+d}%"
113
- pitch_str = f"{current_pitch:+d}Hz"
114
- communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
115
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
116
- audio_path = tmp_file.name
117
- await communicate.save(audio_path)
118
- return start_time_ms, audio_path
119
- return None, None
120
 
121
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
122
  if not transcript_text.strip():
@@ -124,32 +92,67 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
124
  if not voice:
125
  return None, gr.Warning("Please select a voice.")
126
 
127
- lines = transcript_text.strip().split('\n')
128
- audio_segments_with_time = []
129
- max_end_time_ms = 0
130
-
131
- for line in lines:
132
- start_time, audio_path = await process_transcript_line(line, voice, rate, pitch)
133
- if start_time is not None and audio_path:
134
- audio = AudioSegment.from_mp3(audio_path)
135
- audio_segments_with_time.append({'start': start_time, 'audio': audio, 'path': audio_path})
136
- max_end_time_ms = max(max_end_time_ms, start_time + len(audio))
137
- elif audio_path:
138
- os.remove(audio_path) # Clean up even if no timestamp
139
-
140
- if not audio_segments_with_time:
141
- return None, "No valid transcript lines found."
142
-
143
- # Create initial silence audio
144
- final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
145
-
146
- for segment in audio_segments_with_time:
147
- final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
148
- os.remove(segment['path']) # Clean up individual audio files
149
-
150
- combined_audio_path = tempfile.mktemp(suffix=".mp3")
151
- final_audio.export(combined_audio_path, format="mp3")
152
- return combined_audio_path, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  @spaces.GPU
155
  def tts_interface(transcript, voice, rate, pitch):
@@ -160,21 +163,22 @@ async def create_demo():
160
  voices = await get_voices()
161
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
162
  description = """
163
- Process YouTube transcript text with timestamps to generate synchronized audio.
164
- Each line should be in the format: `minutes:seconds[.milliseconds] text`.
165
- Voice prefixes (e.g., 1F, 1C) can be used at the beginning of a line to switch voices.
166
  Example:
167
  ```
168
- 0:00 This
169
- 0:14 is the story of little Red Riding Hood
170
- 0:38 1F Grandma isn’t feeling very well.
171
- 0:48 1C Yes, said Little Red Riding Hood.
 
172
  ```
173
  """
174
  demo = gr.Interface(
175
  fn=tts_interface,
176
  inputs=[
177
- gr.Textbox(label="YouTube Transcript", lines=10, placeholder="0:00 This\n0:14 is the story...\n0:38 1F Grandma..."),
178
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
179
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
180
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -183,7 +187,7 @@ async def create_demo():
183
  gr.Audio(label="Generated Audio", type="filepath"),
184
  gr.Markdown(label="Warning", visible=False)
185
  ],
186
- title="TTS for YouTube Transcripts with Voice Switching",
187
  description=description,
188
  analytics_enabled=False,
189
  allow_flagging=False
 
9
  from pydub import AudioSegment
10
 
11
  def get_silence(duration_ms=1000):
12
+ # ... (get_silence function remains the same)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Get all available voices
15
  async def get_voices():
16
+ # ... (get_voices function remains the same)
17
+
18
+ async def text_to_speech_segment(text_segment, voice, rate, pitch):
19
+ """Processes a single text segment for voice commands and generates audio."""
20
+ current_voice_full = voice
21
+ current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
22
+ current_rate = rate
23
+ current_pitch = pitch
24
+ processed_text = text_segment
25
+
26
+ voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
27
+ voice1_short = voice1_full.split(" - ")[0]
28
+ voice1F_full ="en-GB-SoniaNeural - en-GB (Female)"
29
+ voice1F_short = voice1F_full.split(" - ")[0]
30
+ voice2_full = "en-GB-RyanNeural - en-GB (Male)"
31
+ voice2_short = voice2_full.split(" - ")[0]
32
+ voice2F_full = "en-US-JennyNeural - en-US (Female)"
33
+ voice2F_short = voice2F_full.split(" - ")[0]
34
+ voice3_full ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
35
+ voice3_short = voice3_full.split(" - ")[0]
36
+ voice3F_full = "en-HK-YanNeural - en-HK (Female)"
37
+ voice3F_short = voice3F_full.split(" - ")[0]
38
+ voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
39
+ voice4_short = voice4_full.split(" - ")[0]
40
+ voice4F_full ="en-US-EmmaNeural - en-US (Female)"
41
+ voice4F_short = voice4F_full.split(" - ")[0]
42
+ voice5_full = "en-GB-RyanNeural - en-GB (Male)" #Old Man
43
+ voice5_short = voice5_full.split(" - ")[0]
44
+ voice6_full = "en-GB-MaisieNeural - en-GB (Female)" #Child
45
+ voice6_short = voice6_full.split(" - ")[0]
46
+
47
+ if text_segment.startswith("1F"):
48
+ current_voice_short = voice1F_short
49
+ current_pitch = 25
50
+ processed_text = text_segment[2:].strip()
51
+ elif text_segment.startswith("2F"):
52
+ current_voice_short = voice2F_short
53
+ processed_text = text_segment[2:].strip()
54
+ elif text_segment.startswith("3F"):
55
+ current_voice_short = voice3F_short
56
+ processed_text = text_segment[2:].strip()
57
+ elif text_segment.startswith("4F"):
58
+ current_voice_short = voice4F_short
59
+ processed_text = text_segment[2:].strip()
60
+ elif text_segment.startswith("1M"):
61
+ current_voice_short = voice1_short
62
+ processed_text = text_segment[2:].strip()
63
+ elif text_segment.startswith("2M"):
64
+ current_voice_short = voice2_short
65
+ processed_text = text_segment[2:].strip()
66
+ elif text_segment.startswith("3M"):
67
+ current_voice_short = voice3_short
68
+ processed_text = text_segment[2:].strip()
69
+ elif text_segment.startswith("4M"):
70
+ current_voice_short = voice4_short
71
+ processed_text = text_segment[2:].strip()
72
+ elif text_segment.startswith("1O"): # Old man voice
73
+ current_voice_short = voice5_short
74
+ current_pitch = -20
75
+ current_rate = -10
76
+ processed_text = text_segment[2:].strip()
77
+ elif text_segment.startswith("1C"): #Child voice
78
+ current_voice_short = voice6_short
79
+ processed_text = text_segment[2:].strip()
80
+
81
+ rate_str = f"{current_rate:+d}%"
82
+ pitch_str = f"{current_pitch:+d}Hz"
83
+ communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
84
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
85
+ audio_path = tmp_file.name
86
+ await communicate.save(audio_path)
87
+ return audio_path
 
 
 
 
 
 
 
 
 
88
 
89
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
90
  if not transcript_text.strip():
 
92
  if not voice:
93
  return None, gr.Warning("Please select a voice.")
94
 
95
+ segments = re.split(r'[“”"]', transcript_text)
96
+ audio_paths = []
97
+
98
+ for segment in segments:
99
+ segment = segment.strip()
100
+ if segment:
101
+ # Check if the segment starts with a timestamp
102
+ timestamp_match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', segment)
103
+ if timestamp_match:
104
+ minutes, seconds, milliseconds_str, text_with_commands = timestamp_match.groups()
105
+ start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
106
+ audio_path = await text_to_speech_segment(text_with_commands, voice, rate, pitch)
107
+ audio_paths.append({'start': start_time_ms, 'path': audio_path})
108
+ else:
109
+ # Process segments without timestamps (for voice switching)
110
+ audio_path = await text_to_speech_segment(segment, voice, rate, pitch)
111
+ if audio_path:
112
+ audio_paths.append({'start': None, 'path': audio_path}) # No specific start time
113
+
114
+ if not audio_paths:
115
+ return None, "No audio segments generated."
116
+
117
+ # Handle combining audio with timestamps
118
+ timed_segments = [item for item in audio_paths if item['start'] is not None]
119
+ non_timed_segments = [item for item in audio_paths if item['start'] is None and item['path']]
120
+
121
+ if timed_segments:
122
+ max_end_time_ms = 0
123
+ processed_timed_segments = []
124
+ for item in timed_segments:
125
+ audio = AudioSegment.from_mp3(item['path'])
126
+ processed_timed_segments.append({'start': item['start'], 'audio': audio, 'path': item['path']})
127
+ max_end_time_ms = max(max_end_time_ms, item['start'] + len(audio))
128
+
129
+ final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
130
+ for segment in processed_timed_segments:
131
+ final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
132
+ os.remove(segment['path'])
133
+
134
+ # Append non-timed segments sequentially
135
+ for item in non_timed_segments:
136
+ audio = AudioSegment.from_mp3(item['path'])
137
+ final_audio += audio
138
+ os.remove(item['path'])
139
+
140
+ combined_audio_path = tempfile.mktemp(suffix=".mp3")
141
+ final_audio.export(combined_audio_path, format="mp3")
142
+ return combined_audio_path, None
143
+
144
+ elif non_timed_segments:
145
+ # Combine non-timed segments sequentially if no timestamps are found
146
+ combined_audio = AudioSegment.empty()
147
+ for item in non_timed_segments:
148
+ audio = AudioSegment.from_mp3(item['path'])
149
+ combined_audio += audio
150
+ os.remove(item['path'])
151
+ combined_audio_path = tempfile.mktemp(suffix=".mp3")
152
+ combined_audio.export(combined_audio_path, format="mp3")
153
+ return combined_audio_path, None
154
+
155
+ return None, "No processable audio segments found."
156
 
157
  @spaces.GPU
158
  def tts_interface(transcript, voice, rate, pitch):
 
163
  voices = await get_voices()
164
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
165
  description = """
166
+ Process text, handling both timestamped transcripts and voice switching using quote marks and prefixes.
167
+ Separate segments by quote marks ("). For timestamped segments, use the format: `minutes:seconds[.milliseconds] text`.
168
+ Voice prefixes (e.g., 1F, 1C) can be used at the beginning of a quoted segment to switch voices.
169
  Example:
170
  ```
171
+ 0:00 "This"
172
+ "0:14 is the story of little Red Riding Hood"
173
+ "0:38 1F Grandma isn’t feeling very well."
174
+ "0:48 1C Yes, said Little Red Riding Hood."
175
+ "and then the default voice continues"
176
  ```
177
  """
178
  demo = gr.Interface(
179
  fn=tts_interface,
180
  inputs=[
181
+ gr.Textbox(label="Input Text / Transcript", lines=10, placeholder='0:00 "This"\n"0:14 is the story..."\n"1F Hello"'),
182
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
183
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
184
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
 
187
  gr.Audio(label="Generated Audio", type="filepath"),
188
  gr.Markdown(label="Warning", visible=False)
189
  ],
190
+ title="Combined TTS: Timestamps and Voice Switching",
191
  description=description,
192
  analytics_enabled=False,
193
  allow_flagging=False