cnph001 committed
Commit 06acc33 · verified · 1 Parent(s): d9e25a8

Update app.py

Files changed (1)
  1. app.py +15 -15
app.py CHANGED
@@ -8,6 +8,7 @@ import re
 from pathlib import Path
 from pydub import AudioSegment
 import librosa
+ import soundfile as sf
 import numpy as np

 def get_silence(duration_ms=1000):
@@ -41,7 +42,7 @@ async def get_voices():
     voices = await edge_tts.list_voices()
     return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

- async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
+ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None, speed_adjustment_factor=1.0):
     """Generates audio for a text segment, handling voice prefixes and adjusting rate for duration."""
     current_voice_full = default_voice
     current_voice_short = current_voice_full.split(" - ")[0] if current_voice_full else ""
@@ -104,10 +105,9 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
         print(f"Generated audio duration: {audio_duration_ms}ms, Target duration: {target_duration_ms}ms") # Debug

         if audio_duration_ms > target_duration_ms and target_duration_ms > 0:
-             speed_factor = (audio_duration_ms / target_duration_ms)
-             print(f"Speed factor (to reduce duration): {speed_factor}") # Debug
+             speed_factor = (audio_duration_ms / target_duration_ms) * speed_adjustment_factor
+             print(f"Speed factor (after user adjustment): {speed_factor}") # Debug
             if speed_factor > 0:
-                 speed_factor = speed_factor * 0.8 # Reduce the speed adjustment
                 if speed_factor < 1.0:
                     speed_factor = 1.0
                 y, sr = librosa.load(audio_path, sr=None)
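The rewritten block computes the ratio of generated to target duration, scales it by the user-supplied `speed_adjustment_factor`, and clamps it to at least 1.0 before loading the clip with librosa. The stretch step itself comes after the `librosa.load` call shown at the end of the hunk and is not part of the diff; a minimal sketch of what it can look like, assuming `librosa.effects.time_stretch` plus the `soundfile` import added in this commit (`fit_to_duration` is a hypothetical helper, not a function in app.py):

```python
# Sketch: compress a clip to fit a target duration by time-stretching.
import librosa
import soundfile as sf

def fit_to_duration(audio_path, target_duration_ms, speed_adjustment_factor=1.0):
    y, sr = librosa.load(audio_path, sr=None)      # keep the native sample rate
    duration_ms = len(y) / sr * 1000
    if target_duration_ms and duration_ms > target_duration_ms:
        speed_factor = (duration_ms / target_duration_ms) * speed_adjustment_factor
        speed_factor = max(speed_factor, 1.0)      # only ever speed up, never slow down
        y = librosa.effects.time_stretch(y, rate=speed_factor)  # rate > 1 shortens the clip
        sf.write(audio_path, y, sr)                # overwrite; format must be writable by libsndfile
    return audio_path
```

With a factor below 1.0 the clip is compressed less aggressively and may overshoot its slot; above 1.0 it is compressed harder.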
@@ -119,7 +119,7 @@ async def generate_audio_with_voice_prefix(text_segment, default_voice, rate, pitch, target_duration_ms=None):
         return audio_path
     return None

- async def process_transcript_line(line, default_voice, rate, pitch):
+ async def process_transcript_line(line, default_voice, rate, pitch, speed_adjustment_factor):
     """Processes a single transcript line with HH:MM:SS,milliseconds - HH:MM:SS,milliseconds timestamp."""
     match = re.match(r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)', line)
     if match:
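The regex in `process_transcript_line` captures four numeric groups for each timestamp plus the remaining text of the line. A small self-contained illustration of turning those groups into millisecond offsets (`parse_line` is a hypothetical helper, not code from app.py):

```python
import re

# Same pattern as in process_transcript_line: HH:MM:SS,mmm - HH:MM:SS,mmm text
TIMESTAMP_RE = re.compile(
    r'(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+-\s+(\d{2}):(\d{2}):(\d{2}),(\d{3})\s+(.*)'
)

def parse_line(line):
    """Hypothetical helper: return (start_ms, end_ms, text) or None."""
    m = TIMESTAMP_RE.match(line)
    if not m:
        return None
    h1, m1, s1, ms1, h2, m2, s2, ms2 = (int(g) for g in m.groups()[:8])
    start_ms = ((h1 * 60 + m1) * 60 + s1) * 1000 + ms1
    end_ms = ((h2 * 60 + m2) * 60 + s2) * 1000 + ms2
    return start_ms, end_ms, m.group(9)

print(parse_line('00:00:01,500 - 00:00:04,000 "Hello" and more'))
# (1500, 4000, '"Hello" and more')
```

The difference between the two offsets is the same value `process_transcript_line` appears to pass down as `duration_ms`.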
@@ -145,18 +145,18 @@ async def process_transcript_line(line, default_voice, rate, pitch):
                 process_next = not process_next
                 continue
             if process_next and part.strip():
-                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
+                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)
             elif not process_next and part.strip():
-                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms)
+                 audio_path = await generate_audio_with_voice_prefix(part, default_voice, rate, pitch, duration_ms, speed_adjustment_factor)
                 if audio_path:
                     audio_segments.append(audio_path)

         return start_time_ms, audio_segments, duration_ms
     return None, None, None

- async def transcript_to_speech(transcript_text, voice, rate, pitch):
+ async def transcript_to_speech(transcript_text, voice, rate, pitch, speed_adjustment_factor):
     if not transcript_text.strip():
         return None, gr.Warning("Please enter transcript text.")
     if not voice:
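The `process_next` flag toggles as the line text alternates between segments outside and inside double quotes, which is how quoted segments can carry their own voice prefix. The split itself is outside this hunk; a hedged sketch of one way such an alternating split can work (`split_quoted` is hypothetical, not the app's implementation):

```python
import re

def split_quoted(text):
    """Alternate between outside-quotes and inside-quotes segments."""
    segments = []
    inside_quotes = False
    for part in re.split(r'"', text):
        if part.strip():
            segments.append((inside_quotes, part.strip()))
        inside_quotes = not inside_quotes
    return segments

print(split_quoted('narration "1F Hello there" back to narration'))
# [(False, 'narration'), (True, '1F Hello there'), (False, 'back to narration')]
```

In app.py both branches now forward `speed_adjustment_factor`, so quoted and unquoted segments are stretched with the same user setting.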
@@ -167,7 +167,7 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     max_end_time_ms = 0

     for line in lines:
-         start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch)
+         start_time, audio_paths, duration = await process_transcript_line(line, voice, rate, pitch, speed_adjustment_factor)
         if start_time is not None and audio_paths:
             combined_line_audio = AudioSegment.empty()
             current_time_ms = start_time
@@ -176,7 +176,6 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
             for path in audio_paths:
                 try:
                     audio = AudioSegment.from_mp3(path)
-                     # No need to adjust speed here, it's done in generate_audio_with_voice_prefix
                     combined_line_audio += audio
                     os.remove(path)
                 except FileNotFoundError:
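This hunk only concatenates the per-segment MP3s of one line into `combined_line_audio` and deletes the temporary files; placing each line at its start timestamp happens later in `transcript_to_speech` and is not visible in the diff. A rough pydub sketch of that kind of timestamp placement (`assemble` is a hypothetical helper):

```python
from pydub import AudioSegment

def assemble(positioned_segments, total_ms):
    """Overlay (start_ms, segment) pairs onto a silent base track."""
    base = AudioSegment.silent(duration=total_ms)
    for start_ms, segment in positioned_segments:
        base = base.overlay(segment, position=start_ms)
    return base

clip = AudioSegment.silent(duration=500)             # stand-in for a generated line
track = assemble([(0, clip), (1500, clip)], total_ms=3000)
track.export("combined.mp3", format="mp3")           # export needs ffmpeg, as pydub usually does
```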
@@ -204,8 +203,8 @@ async def transcript_to_speech(transcript_text, voice, rate, pitch):
     return combined_audio_path, None

 @spaces.GPU
- def tts_interface(transcript, voice, rate, pitch):
-     audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch))
+ def tts_interface(transcript, voice, rate, pitch, speed_adjustment_factor):
+     audio, warning = asyncio.run(transcript_to_speech(transcript, voice, rate, pitch, speed_adjustment_factor))
     return audio, warning

 async def create_demo():
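`tts_interface` remains a synchronous wrapper: Gradio calls it directly and `asyncio.run` drives the async pipeline to completion, now forwarding the extra slider value. A minimal stand-alone sketch of that bridge with stubs in place of the real pipeline (`transcript_to_speech_stub` and `tts_interface_stub` are hypothetical):

```python
import asyncio

async def transcript_to_speech_stub(transcript, voice, rate, pitch, speed_adjustment_factor):
    await asyncio.sleep(0)                      # placeholder for the edge-tts / librosa work
    return "out.mp3", None                      # (audio_path, warning)

def tts_interface_stub(transcript, voice, rate, pitch, speed_adjustment_factor):
    # Synchronous entry point; asyncio.run executes the coroutine and returns its result.
    return asyncio.run(
        transcript_to_speech_stub(transcript, voice, rate, pitch, speed_adjustment_factor)
    )

print(tts_interface_stub("00:00:00,000 - 00:00:02,000 Hello", "en-US-AriaNeural", 0, 0, 1.0))
```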
@@ -214,6 +213,7 @@ async def create_demo():
     description = """
     Process timestamped text (HH:MM:SS,milliseconds - HH:MM:SS,milliseconds) with voice changes within quotes.
     The duration specified in the timestamp will be used to adjust the speech rate so the generated audio fits within that time.
+     You can control the intensity of the speed adjustment using the "Speed Adjustment Factor" slider.
     Format: `HH:MM:SS,milliseconds - HH:MM:SS,milliseconds "VoicePrefix Text" more text "AnotherVoicePrefix More Text"`
     Example:
     ```
@@ -243,7 +243,8 @@ async def create_demo():
             gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10, placeholder='00:00:00,000 - 00:00:05,000 "Text" more text "1F Different Voice"'),
             gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Default Voice", value=default_voice),
             gr.Slider(minimum=-50, maximum=50, value=0, label="Speech Rate Adjustment (%)", step=1),
-             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1)
+             gr.Slider(minimum=-50, maximum=50, value=0, label="Pitch Adjustment (Hz)", step=1),
+             gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor")
         ],
         outputs=[
             gr.Audio(label="Generated Audio", type="filepath"),
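`gr.Interface` passes the widgets in `inputs` to the callback positionally, so the new fifth slider lines up with the `speed_adjustment_factor` parameter added to `tts_interface`. A self-contained sketch of that wiring with the same labels and ranges but a stub callback (`tts_stub` is hypothetical):

```python
import gradio as gr

def tts_stub(transcript, voice, rate, pitch, speed_adjustment_factor):
    # The fifth input widget arrives here as speed_adjustment_factor.
    return f"voice={voice}, rate={rate}%, pitch={pitch}Hz, speed factor x{speed_adjustment_factor}"

demo = gr.Interface(
    fn=tts_stub,
    inputs=[
        gr.Textbox(label="Timestamped Text with Voice Changes and Duration", lines=10),
        gr.Dropdown(choices=["en-US-AriaNeural"], label="Select Default Voice"),
        gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Speech Rate Adjustment (%)"),
        gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Pitch Adjustment (Hz)"),
        gr.Slider(minimum=0.5, maximum=1.5, value=1.0, step=0.05, label="Speed Adjustment Factor"),
    ],
    outputs=gr.Textbox(label="Summary"),
)

if __name__ == "__main__":
    demo.launch()
```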
@@ -257,6 +258,5 @@ async def create_demo():
     return demo

 if __name__ == "__main__":
-     import soundfile as sf # Import soundfile here
     demo = asyncio.run(create_demo())
-     demo.launch()
+     demo.launch()
 