avans06 committed
Commit 4abae29 · 1 Parent(s): 404df32

Update the versions of torch, torchaudio, and gradio in requirements.txt.

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ✨
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.9.1
+sdk_version: 5.15.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -210,6 +210,8 @@ class WhisperTranscriber:
         The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.
         """
         try:
+            if self.app_config.verbose:
+                decodeOptions["verbose"] = True
             whisperModelName: str = decodeOptions.pop("whisperModelName")
             whisperLangName: str = decodeOptions.pop("whisperLangName")
 
@@ -1053,18 +1055,18 @@ def create_ui(app_config: ApplicationConfig):
 
    uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
 
-    whisper_models = app_config.get_model_names("whisper")
-    nllb_models = app_config.get_model_names("nllb")
-    m2m100_models = app_config.get_model_names("m2m100")
-    mt5_models = app_config.get_model_names("mt5")
-    ALMA_models = app_config.get_model_names("ALMA")
+    whisper_models = app_config.get_model_names("whisper")
+    nllb_models = app_config.get_model_names("nllb")
+    m2m100_models = app_config.get_model_names("m2m100")
+    mt5_models = app_config.get_model_names("mt5")
+    ALMA_models = app_config.get_model_names("ALMA")
     madlad400_models = app_config.get_model_names("madlad400")
-    seamless_models = app_config.get_model_names("seamless")
-    Llama_models = app_config.get_model_names("Llama")
+    seamless_models = app_config.get_model_names("seamless")
+    Llama_models = app_config.get_model_names("Llama")
     if not torch.cuda.is_available(): # Loading only quantized or models with medium-low parameters in an environment without GPU support.
-        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
-        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
-        ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
+        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
+        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
+        ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
         madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
 
     common_whisper_inputs = lambda : {
@@ -1356,9 +1358,9 @@ def create_ui(app_config: ApplicationConfig):
         return translation
 
     simpleTranscribe = create_transcribe(uiDescription, is_queue_mode)
-    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
-    fullTranscribe = create_transcribe(fullDescription, is_queue_mode, True)
-    uiTranslation = create_translation(is_queue_mode)
+    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
+    fullTranscribe = create_transcribe(fullDescription, is_queue_mode, True)
+    uiTranslation = create_translation(is_queue_mode)
 
     demo = gr.TabbedInterface([simpleTranscribe, fullTranscribe, uiTranslation], tab_names=["Simple", "Full", "Translation"], css=css)
 
@@ -1442,15 +1444,16 @@ if __name__ == '__main__':
 
     updated_config = default_app_config.update(**args)
 
-    updated_config.whisper_implementation = "faster-whisper"
-    updated_config.input_audio_max_duration = -1
-    updated_config.default_model_name = "large-v2"
-    updated_config.output_dir = "output"
-    updated_config.vad_max_merge_size = 90
-    updated_config.merge_subtitle_with_sources = False
-    updated_config.autolaunch = True
-    updated_config.auto_parallel = False
-    updated_config.save_downloaded_files = True
+    # updated_config.whisper_implementation = "faster-whisper"
+    # updated_config.input_audio_max_duration = -1
+    # updated_config.default_model_name = "large-v2"
+    # updated_config.output_dir = "output"
+    # updated_config.vad_max_merge_size = 90
+    # updated_config.merge_subtitle_with_sources = False
+    # updated_config.autolaunch = True
+    # updated_config.auto_parallel = False
+    # updated_config.save_downloaded_files = True
+    # updated_config.verbose = True
 
     try:
         if torch.cuda.is_available():
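A minimal sketch of what the new gate does, outside the full `WhisperTranscriber` method: it only injects `verbose` into the decode options when the application config asks for it. The `AppConfig` stand-in and `build_decode_options` helper below are hypothetical; only the `if self.app_config.verbose:` pattern comes from the diff.

```python
# Hypothetical stand-ins illustrating the verbose gate from the diff above.
class AppConfig:
    def __init__(self, verbose: bool = False):
        self.verbose = verbose

def build_decode_options(app_config: AppConfig, **decodeOptions) -> dict:
    # Only inject the key when verbose is enabled, so the downstream
    # Whisper call keeps its own default otherwise.
    if app_config.verbose:
        decodeOptions["verbose"] = True
    return decodeOptions

print(build_decode_options(AppConfig(verbose=True), beam_size=5))
# -> {'beam_size': 5, 'verbose': True}
```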
config.json5 CHANGED
@@ -367,7 +367,7 @@
   // Device to use for PyTorch inference, or Null to use the default device
   "device": null,
   // Whether to print out the progress and debug messages
-  "verbose": true,
+  "verbose": false,
   // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
   "task": "transcribe",
   // Language spoken in the audio, specify None to perform language detection
requirements-fasterWhisper.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 more_itertools
 zhconv
 sentencepiece
requirements-whisper.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 git+https://github.com/openai/whisper.git
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 altair
 zhconv
 sentencepiece
requirements.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 more_itertools
 zhconv
 sentencepiece
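With the pins above, pip should pull the cu124 wheels from the extra index on Linux/Windows and the plain 2.5.0 build on macOS. A quick post-install check (standard PyTorch attributes, nothing project-specific) shows which build was actually resolved:

```python
# Verify which torch build pip installed; the "+cu124" local version tag
# only appears when the CUDA wheel from the extra index was selected.
import torch

print(torch.__version__)          # e.g. "2.5.0+cu124" on Linux/Windows, "2.5.0" on macOS
print(torch.cuda.is_available())  # True only with a working CUDA device and driver
```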
src/config.py CHANGED
@@ -58,7 +58,7 @@ class ApplicationConfig:
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
                  model_dir: str = None, device: str = None,
-                 verbose: bool = True, task: str = "transcribe", language: str = None,
+                 verbose: bool = False, task: str = "transcribe", language: str = None,
                  vad_initial_prompt_mode: str = "prepend_first_segment ",
                  vad_merge_window: float = 5, vad_max_merge_size: float = 30,
                  vad_padding: float = 1, vad_prompt_window: float = 3,
src/vad.py CHANGED
@@ -14,12 +14,12 @@ from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 from src.segments import merge_timestamps
 from src.whisper.abstractWhisperContainer import AbstractWhisperCallback
 
-# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
-try:
-    import tensorflow as tf
-except ModuleNotFoundError:
-    # Error handling
-    pass
+# # Workaround for https://github.com/tensorflow/tensorflow/issues/48797
+# try:
+#     import tensorflow as tf
+# except ModuleNotFoundError:
+#     # Error handling
+#     pass
 
 import torch
 
@@ -621,7 +621,7 @@ class VadSileroTranscription(AbstractTranscription):
             chunk_start += chunk_duration
 
         perf_end_time = time.perf_counter()
-        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+        print(f"VAD processing took {perf_end_time - perf_start_time} seconds, from {start_time} to {end_time}")
 
         return result
 
src/vadParallel.py CHANGED
@@ -108,6 +108,24 @@ class ParallelTranscription(AbstractTranscription):
     def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: AbstractWhisperCallback, config: TranscriptionConfig,
                             cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None,
                             progress_listener: ProgressListener = None):
+
+        """
+        Perform parallel transcription of an audio file using CPU and GPU.
+
+        Args:
+            transcription (AbstractTranscription): The transcription instance handling processing.
+            audio (str): Path to the audio file to be transcribed.
+            whisperCallable (AbstractWhisperCallback): Callback to interact with the Whisper model.
+            config (TranscriptionConfig): Configuration for transcription settings.
+            cpu_device_count (int): Number of CPU devices to use for processing.
+            gpu_devices (List[str]): List of GPU device IDs to use for processing.
+            cpu_parallel_context (ParallelContext, optional): Context for managing CPU parallel execution.
+            gpu_parallel_context (ParallelContext, optional): Context for managing GPU parallel execution.
+            progress_listener (ProgressListener, optional): Listener for tracking transcription progress.
+
+        Returns:
+            dict: Merged transcription results containing text, segments, and detected language.
+        """
         total_duration = get_audio_duration(audio)
 
         # First, get the timestamps for the original audio
@@ -212,6 +230,20 @@ class ParallelTranscription(AbstractTranscription):
 
     def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
                                         cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
+        """
+        Compute merged timestamps for transcription in parallel using CPU.
+
+        Args:
+            transcription (AbstractTranscription): The transcription instance handling timestamp processing.
+            audio (str): Path to the audio file.
+            config (TranscriptionConfig): Configuration settings for timestamp processing.
+            total_duration (float): Total duration of the audio file in seconds.
+            cpu_device_count (int): Number of CPU devices to use.
+            cpu_parallel_context (ParallelContext, optional): Context for managing CPU parallel execution.
+
+        Returns:
+            list: Merged timestamps after processing.
+        """
         parameters = []
 
         chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
@@ -228,8 +260,7 @@
                 # No need to process chunks that are less than 1 second
                 break
 
-            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
-                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
+            print(f"Parallel VAD: Executing chunk from {chunk_start} to {chunk_end} on CPU device {cpu_device_id}")
             parameters.append([audio, config, chunk_start, chunk_end]);
 
             cpu_device_id += 1
@@ -258,7 +289,7 @@
             merged = transcription.get_merged_timestamps(timestamps, config, total_duration)
 
             perf_end_time = time.perf_counter()
-            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+            print(f"Parallel VAD processing took {perf_end_time - perf_start_time} seconds")
             return merged
 
         finally:
@@ -273,6 +304,17 @@
         return []
 
     def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
+        """
+        Merge timestamps from different transcription segments.
+
+        Args:
+            timestamps (List[Dict[str, Any]]): List of timestamp dictionaries from different segments.
+            config (ParallelTranscriptionConfig): Configuration settings for merging timestamps.
+            total_duration (float): Total duration of the audio file in seconds.
+
+        Returns:
+            list: Merged timestamps after processing.
+        """
         # Override timestamps that will be processed
         if (config.override_timestamps is not None):
             print("(get_merged_timestamps) Using override timestamps of size " + str(len(config.override_timestamps)))
@@ -281,6 +323,18 @@
 
     def transcribe(self, audio: str, whisperCallable: AbstractWhisperCallback, config: ParallelTranscriptionConfig,
                    progressListener: ProgressListener = None):
+        """
+        Perform transcription on a given audio file using the specified device.
+
+        Args:
+            audio (str): Path to the audio file to be transcribed.
+            whisperCallable (AbstractWhisperCallback): Callback to interact with the Whisper model.
+            config (ParallelTranscriptionConfig): Configuration settings for transcription.
+            progressListener (ProgressListener, optional): Listener for tracking transcription progress.
+
+        Returns:
+            dict: Transcription results including text, segments, and detected language.
+        """
         # Override device ID the first time
         if (os.environ.get("INITIALIZED", None) is None):
             os.environ["INITIALIZED"] = "1"
@@ -294,7 +348,15 @@
         return super().transcribe(audio, whisperCallable, config, progressListener)
 
     def _split(self, a, n):
-        """Split a list into n approximately equal parts."""
+        """Split a list into n approximately equal parts.
+
+        Args:
+            a (List[Any]): The list to be split.
+            n (int): The number of parts to split the list into.
+
+        Returns:
+            generator: A generator yielding n sublists.
+        """
         k, m = divmod(len(a), n)
         return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
 
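Since `_split` appears in full in the hunk above, here is a standalone copy with a usage check (same expression, just lifted out of the class):

```python
# Standalone copy of _split from the diff: splits a list into n
# approximately equal, order-preserving parts.
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n))

print(list(split(list(range(10)), 3)))
# -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```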