cnph001 committed
Commit 2899f8f · verified · Parent(s): 37ad470

Update app.py

Files changed (1):
  app.py +123 -158
app.py CHANGED
@@ -7,6 +7,9 @@ import os
 import re
 from pathlib import Path
 from pydub import AudioSegment
+import librosa
+import soundfile as sf
+import numpy as np
 
 def get_silence(duration_ms=1000):
     # Create silent audio segment with specified parameters
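The three new imports support the commit's headline change: generated speech is time-stretched to fit each subtitle's time window, with librosa doing the stretch and soundfile writing the result back. A minimal standalone sketch of that round trip (the file names here are hypothetical):

```python
import librosa
import soundfile as sf

# Load at the file's native sample rate (sr=None disables resampling).
y, sr = librosa.load("input.wav", sr=None)

# rate > 1.0 shortens the audio: 1.25 plays back 25% faster.
y_fast = librosa.effects.time_stretch(y, rate=1.25)

# WAV output avoids depending on the local libsndfile's MP3 support.
sf.write("output.wav", y_fast, sr)
```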
@@ -14,11 +17,9 @@ def get_silence(duration_ms=1000):
         duration=duration_ms,
         frame_rate=24000  # 24kHz sampling rate
     )
-
     # Set audio parameters
     silent_audio = silent_audio.set_channels(1)  # Mono
     silent_audio = silent_audio.set_sample_width(4)  # 32-bit (4 bytes per sample)
-
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
         # Export with specific bitrate and codec parameters
         silent_audio.export(
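For reference, get_silence boils down to this pydub pattern: a silent 24 kHz, mono, 32-bit segment exported as MP3 to a temp file. A sketch of the same steps (the bitrate is an assumed value; the function's real export arguments continue past the end of this hunk):

```python
import tempfile
from pydub import AudioSegment

silent = AudioSegment.silent(duration=1000, frame_rate=24000)  # 1 s at 24 kHz
silent = silent.set_channels(1).set_sample_width(4)            # mono, 32-bit samples

with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
    silent.export(tmp.name, format="mp3", bitrate="48k")  # bitrate assumed for the sketch
print(tmp.name)  # path to the silent MP3
```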
@@ -36,220 +37,183 @@ def get_silence(duration_ms=1000):
 
 # Get all available voices
 async def get_voices():
-    voices = await edge_tts.list_voices()
-    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+    try:
+        voices = await edge_tts.list_voices()
+        return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
+    except Exception as e:
+        print(f"Error listing voices: {e}")
+        return {}
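The dropdown is keyed by a human-readable label that resolves back to the ShortName edge_tts expects. A sketch of the mapping over one hand-written entry shaped like the dicts edge_tts.list_voices() returns:

```python
# One voice record shaped like the dicts edge_tts.list_voices() returns.
voices = [{"ShortName": "en-GB-SoniaNeural", "Locale": "en-GB", "Gender": "Female"}]

mapping = {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v["ShortName"] for v in voices}
print(mapping)  # {'en-GB-SoniaNeural - en-GB (Female)': 'en-GB-SoniaNeural'}
```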
 
-async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch):
-    """Generates audio for a text segment, handling voice prefixes."""
+async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
+    """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
     current_rate = rate
     current_pitch = pitch
     processed_text = text_segment.strip()
-    voice1_full = "en-AU-WilliamNeural - en-AU (Male)"
-    voice1_short = voice1_full.split(" - ")[0]
-    voice1F_full = "en-GB-SoniaNeural - en-GB (Female)"
-    voice1F_short = voice1F_full.split(" - ")[0]
-    voice2_full = "en-GB-RyanNeural - en-GB (Male)"
-    voice2_short = voice2_full.split(" - ")[0]
-    voice2F_full = "en-US-JennyNeural - en-US (Female)"
-    voice2F_short = voice2F_full.split(" - ")[0]
-    voice3_full = "en-US-BrianMultilingualNeural - en-US (Male)"  # good for reading
-    voice3_short = voice3_full.split(" - ")[0]
-    voice3F_full = "en-HK-YanNeural - en-HK (Female)"
-    voice3F_short = voice3F_full.split(" - ")[0]
-    voice4_full = "en-GB-ThomasNeural - en-GB (Male)"
-    voice4_short = voice4_full.split(" - ")[0]
-    voice4F_full = "en-US-EmmaNeural - en-US (Female)"
-    voice4F_short = voice4F_full.split(" - ")[0]
-    voice5_full = "en-GB-RyanNeural - en-GB (Male)"  # Old Man
-    voice5_short = voice5_full.split(" - ")[0]
-    voice6_full = "en-GB-MaisieNeural - en-GB (Female)"  # Child
-    voice6_short = voice6_full.split(" - ")[0]
-    voice7_full = "vi-VN-HoaiMyNeural - vi-VN (Female)"  # Vietnamese
-    voice7_short = voice7_full.split(" - ")[0]
-    voice8_full = "vi-VN-NamMinhNeural - vi-VN (Male)"  # Vietnamese
-    voice8_short = voice8_full.split(" - ")[0]
-    voice9F_full = "de-DE-SeraphinaMultilingualNeural - de-DE (Female)"  # Vietnamese
-    voice9F_short = voice7_full.split(" - ")[0]
-    voice9_full = "ko-KR-HyunsuMultilingualNeural - ko-KR (Male)"  # Vietnamese
-    voice9_short = voice8_full.split(" - ")[0]
-    detect = 0
-    if processed_text.startswith("1F"):
-        current_voice_short = voice1F_short
-        current_pitch = 25
-        detect = 1
-        #processed_text = processed_text[2:].strip()
-    elif processed_text.startswith("2F"):
-        current_voice_short = voice2F_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("3F"):
-        current_voice_short = voice3F_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("4F"):
-        current_voice_short = voice4F_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("1M"):
-        current_voice_short = voice1_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("2M"):
-        current_voice_short = voice2_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("3M"):
-        current_voice_short = voice3_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("4M"):
-        current_voice_short = voice4_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("1O"):  # Old man voice
-        current_voice_short = voice5_short
-        current_pitch = -20
-        current_rate = -10
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("1C"):  # Child voice
-        current_voice_short = voice6_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("1V"):  # Female VN
-        current_voice_short = voice7_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("2V"):
-        current_voice_short = voice8_short
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("3V"):  # Female VN
-        current_voice_short = voice9F_short
-        current_pitch = 25
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    elif processed_text.startswith("4V"):
-        current_voice_short = voice9_short
-        current_pitch = -20
-        #processed_text = processed_text[2:].strip()
-        detect = 1
-    # Looking for number following prefix, which are pitch values.
-    #match = re.search(r'[A-Za-z]\d+', part)  # Look for a letter followed by one or more digits
-    match = re.search(r'[A-Za-z]+\-?\d+', processed_text)  # Look for letter(s) followed by an optional '-' and digits
+    print(f"Processing this text segment: {processed_text}")  # Debug
+    voice_map = {
+        "1F": "en-GB-SoniaNeural",
+        "2M": "en-GB-RyanNeural",
+        "3M": "en-US-BrianMultilingualNeural",
+        "2F": "en-US-JennyNeural",
+        "1M": "en-AU-WilliamNeural",
+        "3F": "en-HK-YanNeural",
+        "4M": "en-GB-ThomasNeural",
+        "4F": "en-US-EmmaNeural",
+        "1O": "en-GB-RyanNeural",  # Old Man
+        "1C": "en-GB-MaisieNeural",  # Child
+        "1V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
+        "2V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
+        "3V": "vi-VN-HoaiMyNeural",  # Vietnamese (Female)
+        "4V": "vi-VN-NamMinhNeural",  # Vietnamese (Male)
+    }
+    detect = 0
+    for prefix, voice_short in voice_map.items():
+        if processed_text.startswith(prefix):
+            current_voice_short = voice_short
+            if prefix in ["1F", "3F", "1V", "3V"]:
+                current_pitch = 25
+            elif prefix in ["1O", "4V"]:
+                current_pitch = -20
+                current_rate = -10
+            detect = 1
+            processed_text = processed_text[len(prefix):].strip()
+            break
+    match = re.search(r'([A-Za-z]+)(-?\d+)', processed_text)  # sign captured with the digits so negative pitch offsets parse
     if match:
-        # Extract the prefix (e.g., '2F') and number (e.g., '-20')
-        prefix = ''.join([ch for ch in match.group() if ch.isalpha()])  # Extract letters (prefix)
-        number = int(''.join([ch for ch in match.group() if ch.isdigit() or ch == '-']))  # Extract digits (number)
-        current_pitch += number
-        # Step 2: Remove the found number from the string
-        new_text = re.sub(r'[A-Za-z]+\-?\d+', '', processed_text, count=1).strip()  # Remove prefix and number (e.g., '2F-20')
-        #processed_text = new_text[2:]  # cut out the prefix like 1F, 3M etc
-        processed_text = new_text[len(prefix):]  # Dynamically remove the prefix part
-    else:
-        if detect:
-            processed_text = processed_text[2:]
+        prefix_pitch = match.group(1)
+        number = int(match.group(2))
+        if prefix_pitch in voice_map:
+            current_pitch += number
+            processed_text = re.sub(r'[A-Za-z]+-?\d+', '', processed_text, count=1).strip()
+        elif detect:
+            processed_text = processed_text.lstrip('-0123456789').strip()  # Remove potential leftover numbers
+    elif detect:
+        processed_text = processed_text.lstrip('-0123456789').strip()  # prefix was already removed in the loop above; only drop leftover digits
     if processed_text:
         rate_str = f"{current_rate:+d}%"
         pitch_str = f"{current_pitch:+d}Hz"
-        communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-            audio_path = tmp_file.name
-            await communicate.save(audio_path)
-            return audio_path
+        try:
+            communicate = edge_tts.Communicate(processed_text, current_voice_short, rate=rate_str, pitch=pitch_str)
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+                audio_path = tmp_file.name
+            await communicate.save(audio_path)
+            if target_duration_ms is not None and os.path.exists(audio_path):
+                audio = AudioSegment.from_mp3(audio_path)
+                audio_duration_ms = len(audio)
+                #print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms")  # Debug
+                if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
+                    speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+                    #print(f"Speed factor (after user adjustment): {speed_factor}")  # Debug
+                    if speed_factor > 0:
+                        if speed_factor < 1.0:
+                            speed_factor = 1.0
+                        y, sr = librosa.load(audio_path, sr=None)
+                        y_stretched = librosa.effects.time_stretch(y, rate=speed_factor)
+                        sf.write(audio_path, y_stretched, sr)  # note: MP3 output needs a libsndfile build with MPEG support
+                else:
+                    print("Generated audio is not longer than target duration, no speed adjustment.")  # Debug
+            return audio_path
+        except Exception as e:
+            print(f"Edge TTS error processing '{processed_text}': {e}")
+            return None
     return None
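The voice_map rewrite makes prefix handling table-driven: the first matching tag picks the voice, applies its pitch/rate preset, and is stripped from the text. A trimmed sketch of that flow (two entries only; the presets and default voice follow the code above):

```python
voice_map = {"1F": "en-GB-SoniaNeural", "1O": "en-GB-RyanNeural"}  # trimmed copy

def apply_prefix(text, pitch=0, rate=0, voice="en-US-AndrewMultilingualNeural"):
    for prefix, short_name in voice_map.items():
        if text.startswith(prefix):
            voice = short_name
            if prefix == "1F":
                pitch = 25              # female preset
            elif prefix == "1O":
                pitch, rate = -20, -10  # old-man preset
            return voice, pitch, rate, text[len(prefix):].strip()
    return voice, pitch, rate, text

print(apply_prefix("1F Now a female voice."))
# ('en-GB-SoniaNeural', 25, 0, 'Now a female voice.')
```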
 
-async def process_transcript_line(line, default_voice, rate, pitch):
-    """Processes a single transcript line with HH:MM:SS.milliseconds timestamp and quoted text segments."""
-    match = re.match(r'(\d{2}):(\d{2}):(\d{2})\.(\d{3})\s+(.*)', line)
+async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
+    """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
+    match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
-        hours, minutes, seconds, milliseconds, text_parts = match.groups()
+        start_h, start_m, start_s, start_ms, end_h, end_m, end_s, end_ms, text_parts = match.groups()
         start_time_ms = (
-            int(hours) * 3600000 +
-            int(minutes) * 60000 +
-            int(seconds) * 1000 +
-            int(milliseconds)
+            int(start_h) * 3600000 +
+            int(start_m) * 60000 +
+            int(start_s) * 1000 +
+            int(start_ms)
+        )
+        end_time_ms = (
+            int(end_h) * 3600000 +
+            int(end_m) * 60000 +
+            int(end_s) * 1000 +
+            int(end_ms)
         )
+        duration_ms = end_time_ms - start_time_ms
         audio_segments = []
-        split_parts = re.split(r'(")', text_parts)  # Split by quote marks, keeping the quotes
-
+        split_parts = re.split(r'([“”"])', text_parts)  # capture group keeps the quote marks so the toggle below sees them
         process_next = False
         for part in split_parts:
             if part == '"':
                 process_next = not process_next
                 continue
             if process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
-                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch)  # Process unquoted text with default voice
+                audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
-        return start_time_ms, audio_segments
-    return None, None
+        return start_time_ms, audio_segments, duration_ms
+    return None, None, None
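The new timestamp regex reads an SRT-style range rather than a single start time, which is what gives each line a target duration. Worked through by hand on the second example from the app's description:

```python
import re

line = '00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"'
m = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
sh, sm, ss, sms, eh, em, es, ems, text = m.groups()

start_ms = int(sh) * 3600000 + int(sm) * 60000 + int(ss) * 1000 + int(sms)
end_ms = int(eh) * 3600000 + int(em) * 60000 + int(es) * 1000 + int(ems)
print(start_ms, end_ms, end_ms - start_ms)  # 5500 10250 4750
```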
 
-async def transcript_to_speech(transcript_text, voice, rate, pitch):
+async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
         return None, gr.Warning("Please select a voice.")
-
     lines = transcript_text.strip().split('\n')
     timed_audio_segments = []
     max_end_time_ms = 0
-
     for line in lines:
-        start_time, audio_paths = await process_transcript_line(line, voice, rate, pitch)
+        start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
+            current_time_ms = start_time
+            segment_duration = duration / len(audio_paths) if audio_paths else 0
             for path in audio_paths:
-                try:
-                    audio = AudioSegment.from_mp3(path)
-                    combined_line_audio += audio
-                    os.remove(path)
-                except FileNotFoundError:
-                    print(f"Warning: Audio file not found: {path}")
-
+                if path:  # Only process if audio_path is not None (meaning TTS was successful)
+                    try:
+                        audio = AudioSegment.from_mp3(path)
+                        combined_line_audio += audio
+                        os.remove(path)
+                    except FileNotFoundError:
+                        print(f"Warning: Audio file not found: {path}")
             if combined_line_audio:
                 timed_audio_segments.append({'start': start_time, 'audio': combined_line_audio})
                 max_end_time_ms = max(max_end_time_ms, start_time + len(combined_line_audio))
         elif audio_paths:
             for path in audio_paths:
-                try:
-                    os.remove(path)
-                except FileNotFoundError:
-                    pass  # Clean up even if no timestamp
-
+                if path:
+                    try:
+                        os.remove(path)
+                    except FileNotFoundError:
+                        pass  # Clean up even if no timestamp
     if not timed_audio_segments:
         return None, "No processable audio segments found."
-
     final_audio = AudioSegment.silent(duration=max_end_time_ms, frame_rate=24000)
     for segment in timed_audio_segments:
         final_audio = final_audio.overlay(segment['audio'], position=segment['start'])
-
     combined_audio_path = tempfile.mktemp(suffix=".mp3")
     final_audio.export(combined_audio_path, format="mp3")
     return combined_audio_path, None
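Assembly is unchanged in spirit: each line's audio is overlaid onto one long silent canvas at its start offset, so gaps between timestamps stay silent. A dependency-light sketch using silent stand-ins for the per-line segments:

```python
from pydub import AudioSegment

timed = [
    {'start': 0,    'audio': AudioSegment.silent(duration=1500, frame_rate=24000)},
    {'start': 5500, 'audio': AudioSegment.silent(duration=2000, frame_rate=24000)},
]

canvas_ms = max(t['start'] + len(t['audio']) for t in timed)   # len() is in ms
final = AudioSegment.silent(duration=canvas_ms, frame_rate=24000)
for t in timed:
    final = final.overlay(t['audio'], position=t['start'])
print(len(final))  # 7500
```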
 
 @spaces.GPU
-def tts_interface(transcript, voice, rate, pitch):
-    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
+def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
+    audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
     return audio, warning
 
 async def create_demo():
     voices = await get_voices()
     default_voice = "en-US-AndrewMultilingualNeural - en-US (Male)"
     description = """
-    Process timestamped text (HH:MM:SS.milliseconds) with voice changes within quotes.
-    Format: `HH:MM:SS.milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
+    Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
+    The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
+    You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
+    Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
-    00:00:00.000 "This is the default voice." more default. "1F Now a female voice." and back to default.
-    00:00:05.000 "1C Yes," said the child, "it is fun!"
+    00:00:00,000 - 00:00:05,000 "This is the default voice." more default. "1F Now a female voice." and back to default.
+    00:00:05,500 - 00:00:10,250 "1C Yes," said the child, "it is fun!"
     ```
     ***************************************************************************************************
     1M = en-AU-WilliamNeural - en-AU (Male)
@@ -271,16 +235,17 @@ async def create_demo():
     demo = gr.Interface(
         fn=tts_interface,
         inputs=[
-            gr.Textbox(label="Timestamped Text with Voice Changes", lines=10, placeholder='00:00:00.000 "Text" more text "1F Different Voice"'),
+            gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
+            gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
+            gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
             gr.Markdown(label="Warning", visible=False)
         ],
-        title="TTS with HH:MM:SS.milliseconds and In-Quote Voice Switching",
+        title="TTS with Duration-Aware Speed Adjustment and In-Quote Voice Switching",
         description=description,
         analytics_enabled=False,
         allow_flagging=False
@@ -289,4 +254,4 @@ async def create_demo():
 
 if __name__ == "__main__":
     demo = asyncio.run(create_demo())
-    demo.launch()
+    demo.launch()