cnph001 committed on
Commit
fba3345
·
verified ·
1 Parent(s): affede7

Update app.py

Browse files

Change to work with srt style text: (00:02:18,541 - 00:02:21,458 Hãy cùng nhau lấy chúng, lấy chúng! )

Files changed (1) hide show
  1. app.py +88 -127
app.py CHANGED
@@ -7,6 +7,8 @@ import os
7
  import re
8
  from pathlib import Path
9
  from pydub import AudioSegment
 
 
10
 
11
  def get_silence(duration_ms=1000):
12
  # Create silent audio segment with specified parameters
@@ -39,118 +41,55 @@ async def get_voices():
39
  voices = await edge_tts.list_voices()
40
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
41
 
42
- async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
43
- """Generates audio for a text segment, handling voice prefixes."""
44
  current_voice_full = default_voice
45
  current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
46
  current_rate = rate
47
  current_pitch = pitch
48
  processed_text = text_segment.strip()
49
- voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
50
- voice1_short = voice1_full.split(" - ")[0]
51
- voice1F_full ="en-GB-SoniaNeural - en-GB (Female)"
52
- voice1F_short = voice1F_full.split(" - ")[0]
53
- voice2_full = "en-GB-RyanNeural - en-GB (Male)"
54
- voice2_short = voice2_full.split(" - ")[0]
55
- voice2F_full = "en-US-JennyNeural - en-US (Female)"
56
- voice2F_short = voice2F_full.split(" - ")[0]
57
- voice3_full ="en-US-BrianMultilingualNeural - en-US (Male)" #good for reading
58
- voice3_short = voice3_full.split(" - ")[0]
59
- voice3F_full = "en-HK-YanNeural - en-HK (Female)"
60
- voice3F_short = voice3F_full.split(" - ")[0]
61
- voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
62
- voice4_short = voice4_full.split(" - ")[0]
63
- voice4F_full ="en-US-EmmaNeural - en-US (Female)"
64
- voice4F_short = voice4F_full.split(" - ")[0]
65
- voice5_full = "en-GB-RyanNeural - en-GB (Male)" #Old Man
66
- voice5_short = voice5_full.split(" - ")[0]
67
- voice6_full = "en-GB-MaisieNeural - en-GB (Female)" #Child
68
- voice6_short = voice6_full.split(" - ")[0]
69
- voice7_full = "vi-VN-HoaiMyNeural - vi-VN (Female)" #Vietnamese
70
- voice7_short = voice7_full.split(" - ")[0]
71
- voice8_full = "vi-VN-NamMinhNeural - vi-VN (Male)" #Vietnamese
72
- voice8_short = voice8_full.split(" - ")[0]
73
- voice9F_full = "de-DE-SeraphinaMultilingualNeural - de-DE (Female)" #Vietnamese
74
- voice9F_short = voice7_full.split(" - ")[0]
75
- voice9_full = "ko-KR-HyunsuMultilingualNeural - ko-KR (Male)" #Vietnamese
76
- voice9_short = voice8_full.split(" - ")[0]
77
- detect=0
78
- if processed_text.startswith("1F"):
79
- current_voice_short = voice1F_short
80
- current_pitch = 25
81
- detect=1
82
- #processed_text = processed_text[2:].strip()
83
- elif processed_text.startswith("2F"):
84
- current_voice_short = voice2F_short
85
- #processed_text = processed_text[2:].strip()
86
- detect=1
87
- elif processed_text.startswith("3F"):
88
- current_voice_short = voice3F_short
89
- #processed_text = processed_text[2:].strip()
90
- detect=1
91
- elif processed_text.startswith("4F"):
92
- current_voice_short = voice4F_short
93
- #processed_text = processed_text[2:].strip()
94
- detect=1
95
- elif processed_text.startswith("1M"):
96
- current_voice_short = voice1_short
97
- #processed_text = processed_text[2:].strip()
98
- detect=1
99
- elif processed_text.startswith("2M"):
100
- current_voice_short = voice2_short
101
- #processed_text = processed_text[2:].strip()
102
- detect=1
103
- elif processed_text.startswith("3M"):
104
- current_voice_short = voice3_short
105
- #processed_text = processed_text[2:].strip()
106
- detect=1
107
- elif processed_text.startswith("4M"):
108
- current_voice_short = voice4_short
109
- #processed_text = processed_text[2:].strip()
110
- detect=1
111
- elif processed_text.startswith("1O"): # Old man voice
112
- current_voice_short = voice5_short
113
- current_pitch = -20
114
- current_rate = -10
115
- #processed_text = processed_text[2:].strip()
116
- detect=1
117
- elif processed_text.startswith("1C"): #Child voice
118
- current_voice_short = voice6_short
119
- #processed_text = processed_text[2:].strip()
120
- detect=1
121
- elif processed_text.startswith("1V"): #Female VN
122
- current_voice_short = voice7_short
123
- #processed_text = processed_text[2:].strip()
124
- detect=1
125
- elif processed_text.startswith("2V"):
126
- current_voice_short = voice8_short
127
- #processed_text = processed_text[2:].strip()
128
- detect=1
129
- elif processed_text.startswith("3V"): #Female VN
130
- current_voice_short = voice9F_short
131
- current_pitch = 25
132
- #processed_text = processed_text[2:].strip()
133
- detect=1
134
- elif processed_text.startswith("4V"):
135
- current_voice_short = voice9_short
136
- current_pitch = -20
137
- #processed_text = processed_text[2:].strip()
138
- detect=1
139
- #Looking for number following prefix, which are pitch values.
140
- #match = re.search(r'[A-Za-z]\d+', part) # Look for a letter followed by one or more digits
141
- match = re.search(r'[A-Za-z]+\-?\d+', processed_text) # Look for a letter(s) followed by an optional '-' and digits
142
  if match:
143
- # Extract the prefix (e.g., '2F') and number (e.g., '-20')
144
- prefix = ''.join([ch for ch in match.group() if ch.isalpha()]) # Extract letters (prefix)
145
- number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-'])) # Extract digits (number)
146
- current_pitch += number
147
- # Step 2: Remove the found number from the string
148
- new_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip() # Remove prefix and number (e.g., '2F-20')
149
- #processed_text = new_text[2:] #cut out the prefix like 1F, 3M etc
150
- processed_text = new_text[len(prefix):] # Dynamically remove the prefix part
151
- else:
152
- if detect:
153
- processed_text = processed_text[2:].strip()
154
  if processed_text:
155
  rate_str = f"{current_rate:+d}%"
156
  pitch_str = f"{current_pitch:+d}Hz"
@@ -158,40 +97,56 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pi
158
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
159
  audio_path = tmp_file.name
160
  await communicate.save(audio_path)
 
 
 
 
 
 
 
 
 
 
 
161
  return audio_path
162
  return None
163
 
164
  async def process_transcript_line(line, default_voice, rate, pitch):
165
- """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
166
- match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
167
  if match:
168
- hours, minutes, seconds, milliseconds, text_parts = match.groups()
169
  start_time_ms = (
170
- int(hours) * 3600000 +
171
- int(minutes) * 60000 +
172
- int(seconds) * 1000 +
173
- int(milliseconds)
174
  )
 
 
 
 
 
 
 
175
  audio_segments = []
176
- #split_parts = re.split(r'(")', text_parts) # Split by quote marks, keeping the quotes
177
- split_parts = re.split(r'[“”"]', text_parts) # Split by quote marks, keeping the quotes
178
- #paragraphs = [p.strip() for p in re.split(r'[“”"]', text) if p.strip()]
179
  process_next = False
180
  for part in split_parts:
181
  if part == '"':
182
  process_next = not process_next
183
  continue
184
  if process_next and part.strip():
185
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
186
  if audio_path:
187
  audio_segments.append(audio_path)
188
  elif not process_next and part.strip():
189
- audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch) # Process unquoted text with default voice
190
  if audio_path:
191
  audio_segments.append(audio_path)
192
 
193
- return start_time_ms, audio_segments
194
- return None, None
195
 
196
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
197
  if not transcript_text.strip():
@@ -204,12 +159,16 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
204
  max_end_time_ms = 0
205
 
206
  for line in lines:
207
- start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
208
  if start_time is not None and audio_paths:
209
  combined_line_audio = AudioSegment.empty()
 
 
 
210
  for path in audio_paths:
211
  try:
212
  audio = AudioSegment.from_mp3(path)
 
213
  combined_line_audio += audio
214
  os.remove(path)
215
  except FileNotFoundError:
@@ -245,12 +204,13 @@ async def create_demo():
245
  voices = await get_voices()
246
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
247
  description = """
248
- Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
249
- Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
 
250
  Example:
251
  ```
252
- 00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
253
- 00:00:10.000 "1C Yes," said the child, "it is fun!"
254
  ```
255
  ***************************************************************************************************
256
  1M = en-AU-WilliamNeural - en-AU (Male)
@@ -272,7 +232,7 @@ async def create_demo():
272
  demo = gr.Interface(
273
  fn=tts_interface,
274
  inputs=[
275
- gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
276
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
277
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
278
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
@@ -281,7 +241,7 @@ async def create_demo():
281
  gr.Audio(label="Generated Audio", type="filepath"),
282
  gr.Markdown(label="Warning", visible=False)
283
  ],
284
- title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
285
  description=description,
286
  analytics_enabled=False,
287
  allow_flagging=False
@@ -289,5 +249,6 @@ async def create_demo():
289
  return demo
290
 
291
  if __name__ == "__main__":
 
292
  demo = asyncio.run(create_demo())
293
  demo.launch()
 
import re
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
from pydub import AudioSegment
13
  def get_silence(duration_ms=1000):
14
  # Create silent audio segment with specified parameters
 
41
  voices = await edge_tts.list_voices()
42
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
43
 
44
async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
    """Generate TTS audio for one text segment, honouring voice-prefix tags.

    A segment may begin with a two-character tag (e.g. "1F", "2V") that
    selects a voice from ``voice_map``; the tag may be followed by a signed
    pitch offset (e.g. "2F-20" means voice 2F, pitch offset -20 Hz).  When
    ``target_duration_ms`` is given the generated audio is time-stretched so
    it fits that duration.

    Args:
        text_segment: Raw text, possibly starting with a voice prefix.
        default_voice: Voice in "ShortName - Locale (Gender)" form, used
            when no prefix tag is present.
        rate: Base speech-rate adjustment in percent.
        pitch: Base pitch adjustment in Hz.
        target_duration_ms: Optional duration (ms) the audio should fill.

    Returns:
        Path of the generated MP3 file, or None if no text remains.
    """
    current_voice_full = default_voice
    current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
    current_rate = rate
    current_pitch = pitch
    processed_text = text_segment.strip()

    # Two-character tag -> edge-tts short voice name.
    voice_map = {
        "1F": "en-GB-SoniaNeural",
        "2M": "en-GB-RyanNeural",
        "3M": "en-US-BrianMultilingualNeural",
        "2F": "en-US-JennyNeural",
        "1M": "en-AU-WilliamNeural",
        "3F": "en-HK-YanNeural",
        "4M": "en-GB-ThomasNeural",
        "4F": "en-US-EmmaNeural",
        "1O": "en-GB-RyanNeural",  # Old Man
        "1C": "en-GB-MaisieNeural",  # Child
        "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
        "2V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
        "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
        "4V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
    }
    detect = 0
    for prefix, voice_short in voice_map.items():
        if processed_text.startswith(prefix):
            current_voice_short = voice_short
            if prefix in ("1F", "3F", "1V", "3V"):
                current_pitch = 25
            elif prefix in ("1O", "4V"):
                current_pitch = -20
                current_rate = -10
            detect = 1
            processed_text = processed_text[len(prefix):].strip()
            break

    # Optional signed pitch offset written immediately after the tag, e.g.
    # "2F-20".  The sign must be captured together with the digits —
    # splitting letters and digits with an uncaptured '-' would turn
    # "2F-20" into +20.  The prefix itself was already removed above, so
    # only a leading signed number can remain here; no further character
    # stripping is needed (the previous [2:] strip ate real text).
    if detect:
        offset_match = re.match(r'(-?\d+)', processed_text)
        if offset_match:
            current_pitch += int(offset_match.group(1))
            processed_text = processed_text[offset_match.end():].strip()

    if processed_text:
        rate_str = f"{current_rate:+d}%"
        pitch_str = f"{current_pitch:+d}Hz"
        # NOTE(review): this line was elided in the diff this file was
        # recovered from; reconstructed from standard edge-tts usage —
        # confirm against the full file.
        communicate = edge_tts.Communicate(processed_text, current_voice_short,
                                           rate=rate_str, pitch=pitch_str)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
            audio_path = tmp_file.name
            await communicate.save(audio_path)

        if target_duration_ms is not None and os.path.exists(audio_path):
            audio = AudioSegment.from_mp3(audio_path)
            audio_duration_ms = len(audio)
            if audio_duration_ms > 0 and target_duration_ms > 0:
                # factor > 1 speeds up (clip too long), < 1 slows down.
                speed_factor = audio_duration_ms / target_duration_ms
                if speed_factor > 0:
                    # librosa time-stretch preserves pitch, unlike naive
                    # frame-rate resampling.
                    y, sr = librosa.load(audio_path, sr=None)
                    y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
                    # NOTE(review): soundfile writes PCM data to a ".mp3"
                    # path; whether libsndfile emits real MP3 here depends
                    # on its version — downstream AudioSegment.from_mp3
                    # may reject it.  Confirm.
                    sf.write(audio_path, y_stretched, sr)
        return audio_path
    return None
113
 
114
async def process_transcript_line(line, default_voice, rate, pitch):
    """Process one SRT-style transcript line into audio segments.

    Expected line format:
        ``HH:MM:SS,mmm - HH:MM:SS,mmm text with optional "quoted" parts``

    Args:
        line: One transcript line.
        default_voice: Fallback voice ("ShortName - Locale (Gender)" form).
        rate: Base speech-rate adjustment in percent.
        pitch: Base pitch adjustment in Hz.

    Returns:
        ``(start_time_ms, audio_paths, duration_ms)`` for a matching line,
        otherwise ``(None, None, None)``.
    """
    match = re.match(
        r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)',
        line,
    )
    if not match:
        return None, None, None

    start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
    start_time_ms = (int(start_h) * 3600000 + int(start_m) * 60000
                     + int(start_s) * 1000 + int(start_ms))
    end_time_ms = (int(end_h) * 3600000 + int(end_m) * 60000
                   + int(end_s) * 1000 + int(end_ms))
    duration_ms = end_time_ms - start_time_ms

    audio_segments = []
    # re.split with an uncaptured character class DROPS the quote marks, so
    # no part can ever equal '"'; the old process_next toggle was dead code
    # and both of its branches were identical.  Simply synthesize every
    # non-empty part — each may carry its own voice prefix.
    for part in re.split(r'[“”"]', text_parts):
        if part.strip():
            audio_path = await generate_audio_with_voice_prefix(
                part, default_voice, rate, pitch, duration_ms)
            if audio_path:
                audio_segments.append(audio_path)

    return start_time_ms, audio_segments, duration_ms
150
 
151
  async def transcript_to_speech(transcript_text, voice, rate, pitch):
152
  if not transcript_text.strip():
 
159
  max_end_time_ms = 0
160
 
161
  for line in lines:
162
+ start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch)
163
  if start_time is not None and audio_paths:
164
  combined_line_audio = AudioSegment.empty()
165
+ current_time_ms = start_time
166
+ segment_duration = duration / len(audio_paths) if audio_paths else 0
167
+
168
  for path in audio_paths:
169
  try:
170
  audio = AudioSegment.from_mp3(path)
171
+ # No need to adjust speed here, it's done in generate_audio_with_voice_prefix
172
  combined_line_audio += audio
173
  os.remove(path)
174
  except FileNotFoundError:
 
204
  voices = await get_voices()
205
  default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
206
  description = """
207
+ Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
208
+ The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
209
+ Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
210
  Example:
211
  ```
212
+ 00:00:00,000 - 00:00:05,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
213
+ 00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"
214
  ```
215
  ***************************************************************************************************
216
  1M = en-AU-WilliamNeural - en-AU (Male)
 
232
  demo = gr.Interface(
233
  fn=tts_interface,
234
  inputs=[
235
+ gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
236
  gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
237
  gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
238
  gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
 
241
  gr.Audio(label="Generated Audio", type="filepath"),
242
  gr.Markdown(label="Warning", visible=False)
243
  ],
244
+ title="TTS with Duration-Aware Speed Adjustment and In-Quote Voice Switching",
245
  description=description,
246
  analytics_enabled=False,
247
  allow_flagging=False
 
249
  return demo
250
 
251
if __name__ == "__main__":
    # soundfile is imported at module level with the other dependencies:
    # generate_audio_with_voice_prefix calls sf.write, so binding `sf` only
    # under this script guard raised NameError whenever app.py was imported
    # as a module instead of run directly.
    demo = asyncio.run(create_demo())
    demo.launch()