cnph001 committed on
Commit ba3a67a · verified · 1 Parent(s): eae282d

Update app.py

Files changed (1)
  1. app.py +102 -103
app.py CHANGED
@@ -39,13 +39,13 @@ async def get_voices():
39
  voices = await edge_tts.list_voices()
40
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
41
 
42
- async def text_to_speech_segment(text_segment, voice, rate, pitch):
43
- """Processes a single text segment for voice commands and generates audio."""
44
- current_voice_full = voice
45
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
46
  current_rate = rate
47
  current_pitch = pitch
48
- processed_text = text_segment
49
 
50
  voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
51
  voice1_short = voice1_full.split(" - ")[0]
@@ -68,47 +68,75 @@ async def text_to_speech_segment(text_segment, voice, rate, pitch):
68
  voice6_full = "en-GB-MaisieNeural - en-GB (Female)" # Child
69
  voice6_short = voice6_full.split(" - ")[0]
70
 
71
- if text_segment.startswith("1F"):
72
  current_voice_short = voice1F_short
73
  current_pitch = 25
74
- processed_text = text_segment[2:].strip()
75
- elif text_segment.startswith("2F"):
76
  current_voice_short = voice2F_short
77
- processed_text = text_segment[2:].strip()
78
- elif text_segment.startswith("3F"):
79
  current_voice_short = voice3F_short
80
- processed_text = text_segment[2:].strip()
81
- elif text_segment.startswith("4F"):
82
  current_voice_short = voice4F_short
83
- processed_text = text_segment[2:].strip()
84
- elif text_segment.startswith("1M"):
85
  current_voice_short = voice1_short
86
- processed_text = text_segment[2:].strip()
87
- elif text_segment.startswith("2M"):
88
  current_voice_short = voice2_short
89
- processed_text = text_segment[2:].strip()
90
- elif text_segment.startswith("3M"):
91
  current_voice_short = voice3_short
92
- processed_text = text_segment[2:].strip()
93
- elif text_segment.startswith("4M"):
94
  current_voice_short = voice4_short
95
- processed_text = text_segment[2:].strip()
96
- elif text_segment.startswith("1O"): # Old man voice
97
  current_voice_short = voice5_short
98
  current_pitch = -20
99
  current_rate = -10
100
- processed_text = text_segment[2:].strip()
101
- elif text_segment.startswith("1C"): # Child voice
102
  current_voice_short = voice6_short
103
- processed_text = text_segment[2:].strip()
104
 
105
- rate_str = f"{current_rate:+d}%"
106
- pitch_str = f"{current_pitch:+d}Hz"
107
- communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
108
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
109
- audio_path = tmp_file.name
110
- await communicate.save(audio_path)
111
- return audio_path
112
 
113
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
114
  if not transcript_text.strip():
@@ -116,67 +144,42 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
116
  if not voice:
117
  return None, gr.Warning("Please select a voice.")
118
 
119
- segments = re.split(r'[“”"]', transcript_text)
120
- audio_paths = []
121
-
122
- for segment in segments:
123
- segment = segment.strip()
124
- if segment:
125
- # Check if the segment starts with a timestamp
126
- timestamp_match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', segment)
127
- if timestamp_match:
128
- minutes, seconds, milliseconds_str, text_with_commands = timestamp_match.groups()
129
- start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
130
- audio_path = await text_to_speech_segment(text_with_commands, voice, rate, pitch)
131
- audio_paths.append({'start': start_time_ms, 'path': audio_path})
132
- else:
133
- # Process segments without timestamps (for voice switching)
134
- audio_path = await text_to_speech_segment(segment, voice, rate, pitch)
135
- if audio_path:
136
- audio_paths.append({'start': None, 'path': audio_path}) # No specific start time
137
-
138
- if not audio_paths:
139
- return None, "No audio segments generated."
140
-
141
- # Handle combining audio with timestamps
142
- timed_segments = [item for item in audio_paths if item['start'] is not None]
143
- non_timed_segments = [item for item in audio_paths if item['start'] is None and item['path']]
144
-
145
- if timed_segments:
146
- max_end_time_ms = 0
147
- processed_timed_segments = []
148
- for item in timed_segments:
149
- audio = AudioSegment.from_mp3(item['path'])
150
- processed_timed_segments.append({'start': item['start'], 'audio': audio, 'path': item['path']})
151
- max_end_time_ms = max(max_end_time_ms, item['start'] + len(audio))
152
-
153
- final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
154
- for segment in processed_timed_segments:
155
- final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
156
- os.remove(segment['path'])
157
-
158
- # Append non-timed segments sequentially
159
- for item in non_timed_segments:
160
- audio = AudioSegment.from_mp3(item['path'])
161
- final_audio += audio
162
- os.remove(item['path'])
163
-
164
- combined_audio_path = tempfile.mktemp(suffix=".mp3")
165
- final_audio.export(combined_audio_path, format="mp3")
166
- return combined_audio_path, None
167
-
168
- elif non_timed_segments:
169
- # Combine non-timed segments sequentially if no timestamps are found
170
- combined_audio = AudioSegment.empty()
171
- for item in non_timed_segments:
172
- audio = AudioSegment.from_mp3(item['path'])
173
- combined_audio += audio
174
- os.remove(item['path'])
175
- combined_audio_path = tempfile.mktemp(suffix=".mp3")
176
- combined_audio.export(combined_audio_path, format="mp3")
177
- return combined_audio_path, None
178
-
179
- return None, "No processable audio segments found."
180
 
181
  @spaces.GPU
182
  def tts_interface(transcript, voice, rate, pitch):
@@ -187,23 +190,19 @@ async def create_demo():
187
  voices = await get_voices()
188
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
189
  description = """
190
- Process text, handling both timestamped transcripts and voice switching using quote marks and prefixes.
191
- Separate segments by quote marks ("). For timestamped segments, use the format: `minutes:seconds[.milliseconds] text`.
192
- Voice prefixes (e.g., 1F, 1C) can be used at the beginning of a quoted segment to switch voices.
193
  Example:
194
  ```
195
- 0:00 "This"
196
- "0:14 is the story of little Red Riding Hood"
197
- "0:38 1F Grandma isn’t feeling very well."
198
- "0:48 1C Yes, said Little Red Riding Hood."
199
- "and then the default voice continues"
200
  ```
201
  """
202
  demo = gr.Interface(
203
  fn=tts_interface,
204
  inputs=[
205
- gr.Textbox(label="Input Text / Transcript", lines=10, placeholder='0:00 "This"\n"0:14 is the story..."\n"1F Hello"'),
206
- gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=default_voice),
207
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
208
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
209
  ],
@@ -211,7 +210,7 @@ async def create_demo():
211
  gr.Audio(label="Generated Audio", type="filepath"),
212
  gr.Markdown(label="Warning", visible=False)
213
  ],
214
- title="Combined TTS: Timestamps and Voice Switching",
215
  description=description,
216
  analytics_enabled=False,
217
  allow_flagging=False
 
39
  voices = await edge_tts.list_voices()
40
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
41
 
42
+ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
43
+ """Generates audio for a text segment, handling voice prefixes."""
44
+ current_voice_full = default_voice
45
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
46
  current_rate = rate
47
  current_pitch = pitch
48
+ processed_text = text_segment.strip()
49
 
50
  voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
51
  voice1_short = voice1_full.split(" - ")[0]
 
68
  voice6_full = "en-GB-MaisieNeural - en-GB (Female)" # Child
69
  voice6_short = voice6_full.split(" - ")[0]
70
 
71
+ if processed_text.startswith("1F"):
72
  current_voice_short = voice1F_short
73
  current_pitch = 25
74
+ processed_text = processed_text[2:].strip()
75
+ elif processed_text.startswith("2F"):
76
  current_voice_short = voice2F_short
77
+ processed_text = processed_text[2:].strip()
78
+ elif processed_text.startswith("3F"):
79
  current_voice_short = voice3F_short
80
+ processed_text = processed_text[2:].strip()
81
+ elif processed_text.startswith("4F"):
82
  current_voice_short = voice4F_short
83
+ processed_text = processed_text[2:].strip()
84
+ elif processed_text.startswith("1M"):
85
  current_voice_short = voice1_short
86
+ processed_text = processed_text[2:].strip()
87
+ elif processed_text.startswith("2M"):
88
  current_voice_short = voice2_short
89
+ processed_text = processed_text[2:].strip()
90
+ elif processed_text.startswith("3M"):
91
  current_voice_short = voice3_short
92
+ processed_text = processed_text[2:].strip()
93
+ elif processed_text.startswith("4M"):
94
  current_voice_short = voice4_short
95
+ processed_text = processed_text[2:].strip()
96
+ elif processed_text.startswith("1O"): # Old man voice
97
  current_voice_short = voice5_short
98
  current_pitch = -20
99
  current_rate = -10
100
+ processed_text = processed_text[2:].strip()
101
+ elif processed_text.startswith("1C"): # Child voice
102
  current_voice_short = voice6_short
103
+ processed_text = processed_text[2:].strip()
104
+
105
+ if processed_text:
106
+ rate_str = f"{current_rate:+d}%"
107
+ pitch_str = f"{current_pitch:+d}Hz"
108
+ communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
109
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
110
+ audio_path = tmp_file.name
111
+ await communicate.save(audio_path)
112
+ return audio_path
113
+ return None
114
+
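The prefix dispatch above is a chain of `startswith` checks. As a minimal standalone sketch (not the commit's code), the same dispatch can be table-driven; only the two voice names visible in this diff are filled in, since the short names for the other prefixes are defined in lines the hunk elides:

```python
# Sketch only: data-driven form of the prefix chain above.
# Names for 1F-4F, 2M-4M and 1O are elided from this hunk, so they
# stay commented out; None means "keep the caller's rate/pitch".
PREFIX_TABLE = {
    # prefix: (short_voice_name, rate_override, pitch_override)
    "1M": ("en-AU-WilliamNeural", None, None),
    "1C": ("en-GB-MaisieNeural", None, None),   # child voice
    # "1F": (voice1F_short, None, 25),
    # "1O": (voice5_short, -10, -20),           # old-man voice
}

def apply_prefix(text, voice_short, rate, pitch):
    """Consume a leading voice prefix; return (voice, rate, pitch, rest)."""
    for prefix, (v, r, p) in PREFIX_TABLE.items():
        if text.startswith(prefix):
            return (v,
                    rate if r is None else r,
                    pitch if p is None else p,
                    text[len(prefix):].strip())
    return voice_short, rate, pitch, text
```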
115
+ async def process_transcript_line(line, default_voice, rate, pitch):
116
+ """Processes a single transcript line with timestamp and quoted text segments."""
117
+ match = re.match(r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line)
118
+ if match:
119
+ minutes, seconds, milliseconds_str, text_parts = match.groups()
120
+ start_time_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(milliseconds_str) * 10 if milliseconds_str else 0)
121
+ audio_segments = []
122
+ split_parts = re.split(r'(")', text_parts) # Split by quote marks, keeping the quotes
123
+
124
+ process_next = False
125
+ for part in split_parts:
126
+ if part == '"':
127
+ process_next = not process_next
128
+ continue
129
+ if process_next and part.strip():
130
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
131
+ if audio_path:
132
+ audio_segments.append(audio_path)
133
+ elif not process_next and part.strip():
134
+ audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch) # Process unquoted text with default voice
135
+ if audio_path:
136
+ audio_segments.append(audio_path)
137
 
138
+ return start_time_ms, audio_segments
139
+ return None, None
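To make the parsing above concrete, here is a minimal sketch (not part of the commit) of its two steps, run on the example line from the description below. Note that the optional fractional part of the timestamp is multiplied by 10, i.e. read as centiseconds, and that both quoted and unquoted runs are later passed to `generate_audio_with_voice_prefix`; the quote toggle only marks segment boundaries:

```python
import re

# Sketch only: timestamp-to-milliseconds and quote toggling, as above.
line = '0:05 "1C Yes," said the child, "it is fun!"'
minutes, seconds, frac, rest = re.match(
    r'(\d+):(\d+)(?:\.(\d+))?\s+(.*)', line).groups()
start_ms = int(minutes) * 60000 + int(seconds) * 1000 + (int(frac) * 10 if frac else 0)
print(start_ms)  # 5000  ("0:05.5" would give 5050)

in_quotes = False
for part in re.split(r'(")', rest):    # split keeps the quote marks as tokens
    if part == '"':
        in_quotes = not in_quotes      # flip state at every quote mark
    elif part.strip():
        print("quoted" if in_quotes else "unquoted", repr(part.strip()))
# quoted '1C Yes,'  /  unquoted 'said the child,'  /  quoted 'it is fun!'
```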
140
 
141
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
142
  if not transcript_text.strip():
 
144
  if not voice:
145
  return None, gr.Warning("Please select a voice.")
146
 
147
+ lines = transcript_text.strip().split('\n')
148
+ timed_audio_segments = []
149
+ max_end_time_ms = 0
150
+
151
+ for line in lines:
152
+ start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
153
+ if start_time is not None and audio_paths:
154
+ combined_line_audio = AudioSegment.empty()
155
+ for path in audio_paths:
156
+ try:
157
+ audio = AudioSegment.from_mp3(path)
158
+ combined_line_audio += audio
159
+ os.remove(path)
160
+ except FileNotFoundError:
161
+ print(f"Warning: Audio file not found: {path}")
162
+
163
+ if combined_line_audio:
164
+ timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
165
+ max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
166
+ elif audio_paths:
167
+ for path in audio_paths:
168
+ try:
169
+ os.remove(path)
170
+ except FileNotFoundError:
171
+ pass # Clean up even if no timestamp
172
+
173
+ if not timed_audio_segments:
174
+ return None, "No processable audio segments found."
175
+
176
+ final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
177
+ for segment in timed_audio_segments:
178
+ final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
179
+
180
+ combined_audio_path = tempfile.mktemp(suffix=".mp3")
181
+ final_audio.export(combined_audio_path, format="mp3")
182
+ return combined_audio_path, None
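The assembly above sizes a silent canvas to the latest end time and mixes each line in with `overlay`, so audio that runs past the next timestamp overlaps the following line instead of pushing it back. A minimal sketch of that pattern with placeholder clips (silence stands in for the generated speech):

```python
from pydub import AudioSegment

# Sketch only: overlay timed clips onto a silent canvas, as above.
segments = [
    {'start': 0,    'audio': AudioSegment.silent(duration=1200)},  # placeholder
    {'start': 5000, 'audio': AudioSegment.silent(duration=800)},   # placeholder
]
canvas_ms = max(s['start'] + len(s['audio']) for s in segments)    # len() is ms
final = AudioSegment.silent(duration=canvas_ms, frame_rate=24000)
for s in segments:
    final = final.overlay(s['audio'], position=s['start'])
final.export("combined.mp3", format="mp3")  # mp3 export requires ffmpeg
```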
183
 
184
  @spaces.GPU
185
  def tts_interface(transcript, voice, rate, pitch):
 
190
  voices = await get_voices()
191
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
192
  description = """
193
+ Process timestamped text; a quoted segment may begin with a voice prefix (e.g. 1F, 1C) to switch voices.
194
+ Format: `minutes:seconds[.milliseconds] "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
 
195
  Example:
196
  ```
197
+ 0:00 "This is the default voice." more default. "1F Now a female voice." and back to default.
198
+ 0:05 "1C Yes," said the child, "it is fun!"
199
  ```
200
  """
201
  demo = gr.Interface(
202
  fn=tts_interface,
203
  inputs=[
204
+ gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='0:00 "Text" more text "1F Different Voice"'),
205
+ gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
206
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
207
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
208
  ],
 
210
  gr.Audio(label="Generated Audio", type="filepath"),
211
  gr.Markdown(label="Warning", visible=False)
212
  ],
213
+ title="TTS with Timestamp and In-Quote Voice Switching",
214
  description=description,
215
  analytics_enabled=False,
216
  allow_flagging=False