cnph001 committed
Commit 7b3b340 · verified · 1 Parent(s): ff3ad52

Update app.py

Files changed (1):
  1. app.py +204 -111
app.py CHANGED
@@ -1,5 +1,12 @@
-import soundfile as sf
-import numpy as np
+import spaces
+import gradio as gr
+import edge_tts
+import asyncio
+import tempfile
+import os
+import re
+from pathlib import Path
+from pydub import AudioSegment
 
 def get_silence(duration_ms=1000):
     # Create silent audio segment with specified parameters
@@ -7,9 +14,11 @@ def get_silence(duration_ms=1000):
         duration=duration_ms,
         frame_rate=24000 # 24kHz sampling rate
     )
+
     # Set audio parameters
     silent_audio = silent_audio.set_channels(1) # Mono
     silent_audio = silent_audio.set_sample_width(4) # 32-bit (4 bytes per sample)
+
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         # Export with specific bitrate and codec parameters
         silent_audio.export(
@@ -27,124 +36,163 @@ def get_silence(duration_ms=1000):
 
 # Get all available voices
 async def get_voices():
-
-    try:
-        voices = await edge_tts.list_voices()
-        return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
-
-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
-    """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
+    voices = await edge_tts.list_voices()
+    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
+    """Generates audio for a text segment, handling voice prefixes."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    print(f"Processing this text segment: {processed_text}") # Debug
-    voice_map = {
-        "1F": "en-GB-SoniaNeural",
-        "2M": "en-GB-RyanNeural",
-        "3M": "en-US-BrianMultilingualNeural",
-        "2F": "en-US-JennyNeural",
-        "1M": "en-AU-WilliamNeural",
-        "3F": "en-HK-YanNeural",
-        "4M": "en-GB-ThomasNeural",
-        "4F": "en-US-EmmaNeural",
-        "1O": "en-GB-RyanNeural", # Old Man
-        "1C": "en-GB-MaisieNeural", # Child
-        "1V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
-        "2V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
-        "3V": "vi-VN-HoaiMyNeural", # Vietnamese (Female)
-        "4V": "vi-VN-NamMinhNeural", # Vietnamese (Male)
-    }
-    detect = 0
-    for prefix, voice_short in voice_map.items():
-        if processed_text.startswith(prefix):
-            current_voice_short = voice_short
-            if prefix in ["1F", "3F", "1V", "3V"]:
-            elif prefix in ["1O", "4V"]:
-                current_pitch = -20
-                current_rate = -10
-            detect = 1
-            processed_text = processed_text[len(prefix):].strip()
-            break
-
-    match = re.search(r'([A-Za-z]+)-?(\d+)', processed_text)
+    voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
+    voice1_short = voice1_full.split(" - ")[0]
+    voice1F_full = "en-GB-SoniaNeural - en-GB (Female)"
+    voice1F_short = voice1F_full.split(" - ")[0]
+    voice2_full = "en-GB-RyanNeural - en-GB (Male)"
+    voice2_short = voice2_full.split(" - ")[0]
+    voice2F_full = "en-US-JennyNeural - en-US (Female)"
+    voice2F_short = voice2F_full.split(" - ")[0]
+    voice3_full = "en-US-BrianMultilingualNeural - en-US (Male)" # good for reading
+    voice3_short = voice3_full.split(" - ")[0]
+    voice3F_full = "en-HK-YanNeural - en-HK (Female)"
+    voice3F_short = voice3F_full.split(" - ")[0]
+    voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
+    voice4_short = voice4_full.split(" - ")[0]
+    voice4F_full = "en-US-EmmaNeural - en-US (Female)"
+    voice4F_short = voice4F_full.split(" - ")[0]
+    voice5_full = "en-GB-RyanNeural - en-GB (Male)" # Old Man
+    voice5_short = voice5_full.split(" - ")[0]
+    voice6_full = "en-GB-MaisieNeural - en-GB (Female)" # Child
+    voice6_short = voice6_full.split(" - ")[0]
+    voice7_full = "vi-VN-HoaiMyNeural - vi-VN (Female)" # Vietnamese
+    voice7_short = voice7_full.split(" - ")[0]
+    voice8_full = "vi-VN-NamMinhNeural - vi-VN (Male)" # Vietnamese
+    voice8_short = voice8_full.split(" - ")[0]
+    voice9F_full = "de-DE-SeraphinaMultilingualNeural - de-DE (Female)" # Vietnamese
+    voice9F_short = voice7_full.split(" - ")[0]
+    voice9_full = "ko-KR-HyunsuMultilingualNeural - ko-KR (Male)" # Vietnamese
+    voice9_short = voice8_full.split(" - ")[0]
+    detect = 0
+    if processed_text.startswith("1F"):
+        current_voice_short = voice1F_short
+        current_pitch = 25
+        detect = 1
+        #processed_text = processed_text[2:].strip()
+    elif processed_text.startswith("2F"):
+        current_voice_short = voice2F_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("3F"):
+        current_voice_short = voice3F_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("4F"):
+        current_voice_short = voice4F_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("1M"):
+        current_voice_short = voice1_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("2M"):
+        current_voice_short = voice2_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("3M"):
+        current_voice_short = voice3_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("4M"):
+        current_voice_short = voice4_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("1O"): # Old man voice
+        current_voice_short = voice5_short
+        current_pitch = -20
+        current_rate = -10
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("1C"): # Child voice
+        current_voice_short = voice6_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("1V"): # Female VN
+        current_voice_short = voice7_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("2V"):
+        current_voice_short = voice8_short
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("3V"): # Female VN
+        current_voice_short = voice9F_short
+        current_pitch = 25
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    elif processed_text.startswith("4V"):
+        current_voice_short = voice9_short
+        current_pitch = -20
+        #processed_text = processed_text[2:].strip()
+        detect = 1
+    # Looking for a number following the prefix; these are pitch values.
+    #match = re.search(r'[A-Za-z]\d+', part) # Look for a letter followed by one or more digits
+    match = re.search(r'[A-Za-z]+\-?\d+', processed_text) # Look for letter(s) followed by an optional '-' and digits
     if match:
-        prefix_pitch = match.group(1)
-        number = int(match.group(2))
-        if prefix_pitch in voice_map:
-            current_pitch += number
-            processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
-        elif detect:
-            processed_text = processed_text.lstrip('-0123456789').strip() # Remove potential leftover numbers
-    elif detect:
-        processed_text = processed_text[2:].strip()
+        # Extract the prefix (e.g., '2F') and number (e.g., '-20')
+        prefix = ''.join([ch for ch in match.group() if ch.isalpha()]) # Extract letters (prefix)
+        number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-'])) # Extract digits (number)
+        current_pitch += number
+        # Step 2: Remove the found number from the string
+        new_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip() # Remove prefix and number (e.g., '2F-20')
+        #processed_text = new_text[2:] # cut out the prefix like 1F, 3M etc
+        processed_text = new_text[len(prefix):] # Dynamically remove the prefix part
+    else:
+        if detect:
+            processed_text = part[2:]
     if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            audio_path = tmp_file.name
-        await communicate.save(audio_path)
-        if target_duration_ms is not None and os.path.exists(audio_path):
-            audio = AudioSegment.from_mp3(audio_path)
-            audio_duration_ms = len(audio)
-            #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug
-            if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
-                speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
-                #print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
-                if speed_factor > 0:
-                    if speed_factor < 1.0:
-                        speed_factor = 1.0
-                    y, sr = librosa.load(audio_path, sr=None)
-                    y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
-                    sf.write(audio_path, y_stretched, sr)
-            else:
-                print("Generated audio is not longer than target duration, no speed adjustment.") # Debug
-        return audio_path
-    except Exception as e:
-        print(f"Edge TTS error processing '{processed_text}': {e}")
-        return None
+        communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+            audio_path = tmp_file.name
+        await communicate.save(audio_path)
+        return audio_path
     return None
 
-async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
-    """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
-    match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
+async def process_transcript_line(line, default_voice, rate, pitch):
+    """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
+    match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
     if match:
-        start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
+        hours, minutes, seconds, milliseconds, text_parts = match.groups()
         start_time_ms = (
-            int(start_h) * 3600000 +
-            int(start_m) * 60000 +
-            int(start_s) * 1000 +
-            int(start_ms)
-        )
-        end_time_ms = (
-            int(end_h) * 3600000 +
-            int(end_m) * 60000 +
-            int(end_s) * 1000 +
-            int(end_ms)
+            int(hours) * 3600000 +
+            int(minutes) * 60000 +
+            int(seconds) * 1000 +
+            int(milliseconds)
         )
-        duration_ms = end_time_ms - start_time_ms
-
         audio_segments = []
-        split_parts = re.split(r'[“”"]', text_parts)
+        split_parts = re.split(r'(")', text_parts) # Split by quote marks, keeping the quotes
+
         process_next = False
         for part in split_parts:
            if part == '"':
                process_next = not process_next
                continue
            if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
                if audio_path:
                    audio_segments.append(audio_path)
            elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch) # Process unquoted text with default voice
                if audio_path:
                    audio_segments.append(audio_path)
-        return start_time_ms, audio_segments, duration_ms
-    return None, None, None
+
+        return start_time_ms, audio_segments
+    return None, None
 
-async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
-
+async def transcript_to_speech(transcript_text, voice, rate, pitch):
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
@@ -153,47 +201,92 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
+
     for line in lines:
-        start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
+        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
-            current_time_ms = start_time
-            segment_duration = duration / len(audio_paths) if audio_paths else 0
             for path in audio_paths:
-                if path: # Only process if audio_path is not None (meaning TTS was successful)
-                    try:
-                        audio = AudioSegment.from_mp3(path)
-                        combined_line_audio += audio
-                        os.remove(path)
-                    except FileNotFoundError:
-                        print(f"Warning: Audio file not found: {path}")
+                try:
+                    audio = AudioSegment.from_mp3(path)
+                    combined_line_audio += audio
+                    os.remove(path)
+                except FileNotFoundError:
+                    print(f"Warning: Audio file not found: {path}")
+
             if combined_line_audio:
                 timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                 max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
         elif audio_paths:
             for path in audio_paths:
-                if path:
-                    try:
-                        os.remove(path)
-                    except FileNotFoundError:
-                        pass # Clean up even if no timestamp
+                try:
+                    os.remove(path)
+                except FileNotFoundError:
+                    pass # Clean up even if no timestamp
+
     if not timed_audio_segments:
         return None, "No processable audio segments found."
+
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
+
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None
 
 @spaces.GPU
-def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
-
-    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
+def tts_interface(transcript, voice, rate, pitch):
+    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
     return audio, warning
 
 async def create_demo():
-
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
-    description = """
+    description = """
+    Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
+    Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
+    Example:
+    ```
+    00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
+    00:00:05.000 "1C Yes," said the child, "it is fun!"
+    ```
+    ***************************************************************************************************
+    1M = en-AU-WilliamNeural - en-AU (Male)
+    1F = en-GB-SoniaNeural - en-GB (Female)
+    2M = en-GB-RyanNeural - en-GB (Male)
+    2F = en-US-JennyNeural - en-US (Female)
+    3M = en-US-BrianMultilingualNeural - en-US (Male)
+    3F = en-HK-YanNeural - en-HK (Female)
+    4M = en-GB-ThomasNeural - en-GB (Male)
+    4F = en-US-EmmaNeural - en-US (Female)
+    1O = en-GB-RyanNeural - en-GB (Male) # Old Man
+    1C = en-GB-MaisieNeural - en-GB (Female) # Child
+    1V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
+    2V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
+    3V = vi-VN-HoaiMyNeural - vi-VN (Female) # Vietnamese (Female)
+    4V = vi-VN-NamMinhNeural - vi-VN (Male) # Vietnamese (Male)
+    ****************************************************************************************************
+    """
+    demo = gr.Interface(
+        fn=tts_interface,
+        inputs=[
+            gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
+            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
+        ],
+        outputs=[
+            gr.Audio(label="Generated Audio", type="filepath"),
+            gr.Markdown(label="Warning", visible=False)
+        ],
+        title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
+        description=description,
+        analytics_enabled=False,
+        allow_flagging=False
+    )
+    return demo
+
+if __name__ == "__main__":
+    demo = asyncio.run(create_demo())
+    demo.launch()
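For readers tracing the new prefix logic in `generate_audio_with_voice_prefix`, here is a minimal standalone sketch of the same parsing step. It is an illustration, not code from the commit: `VOICE_MAP` and `parse_voice_prefix` are hypothetical names, and the tag table is trimmed to two entries.

```python
import re

# Tag table in the style of the commit's prefix handling (trimmed to two tags).
VOICE_MAP = {
    "1F": "en-GB-SoniaNeural",  # female
    "1O": "en-GB-RyanNeural",   # old man
}

def parse_voice_prefix(segment, default_voice, base_pitch=0):
    """Return (voice, pitch, text) for a segment like '1F-20 Hello'.

    A leading tag selects a voice; an optional signed number immediately
    after the tag is added to the pitch offset (in Hz).
    """
    text = segment.strip()
    voice, pitch = default_voice, base_pitch
    m = re.match(r'(\d[A-Z])(-?\d+)?\s*', text)
    if m and m.group(1) in VOICE_MAP:
        voice = VOICE_MAP[m.group(1)]
        if m.group(2):                 # e.g. '-20' in '1F-20'
            pitch += int(m.group(2))
        text = text[m.end():]          # drop the tag from the spoken text
    return voice, pitch, text

print(parse_voice_prefix("1F-20 Hello there", "en-US-AndrewMultilingualNeural"))
# -> ('en-GB-SoniaNeural', -20, 'Hello there')
```

Anchoring the tag with `re.match` is a deliberate simplification: the committed code scans with `re.search(r'[A-Za-z]+\-?\d+', ...)`, which can also latch onto a letter-digit run inside ordinary dialogue (e.g. "MP3") and read it as a pitch adjustment, and its `else` branch references `part`, a name defined only in the caller's loop.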
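The assembly step in `transcript_to_speech` is likewise easy to illustrate in isolation: each line's HH:MM:SS.mmm stamp becomes a millisecond offset, and pydub overlays the generated clip onto a silent canvas at that offset. A small sketch under the same assumptions (`stamp_to_ms` and `place_clips` are illustrative names, not from the commit):

```python
import re
from pydub import AudioSegment

def stamp_to_ms(stamp):
    """Convert 'HH:MM:SS.mmm' to milliseconds, e.g. '00:00:05.250' -> 5250."""
    h, m, s, ms = map(int, re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})', stamp).groups())
    return h * 3600000 + m * 60000 + s * 1000 + ms

def place_clips(clips):
    """clips: list of (start_ms, AudioSegment) pairs. Returns a single track
    with each clip overlaid at its offset, padded with silence to the last end.
    """
    total_ms = max(start + len(seg) for start, seg in clips)  # len(seg) is in ms
    canvas = AudioSegment.silent(duration=total_ms, frame_rate=24000)
    for start, seg in clips:
        canvas = canvas.overlay(seg, position=start)
    return canvas

assert stamp_to_ms("00:00:05.250") == 5250
```

Because `overlay` mixes rather than concatenates, clips whose stamps sit closer together than their durations will overlap in the output, which matches the committed behavior.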