avans06 committed
Commit 4abae29 · 1 Parent(s): 404df32

Update the versions of torch, torchaudio, and gradio in requirements.txt.

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: ✨
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 5.9.1
+sdk_version: 5.15.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -210,6 +210,8 @@ class WhisperTranscriber:
         The model ensures that a sequence of words of no_repeat_ngram_size isn’t repeated in the output sequence. If specified, it must be a positive integer greater than 1.
         """
         try:
+            if self.app_config.verbose:
+                decodeOptions["verbose"] = True
             whisperModelName: str = decodeOptions.pop("whisperModelName")
             whisperLangName: str = decodeOptions.pop("whisperLangName")
 
@@ -1053,18 +1055,18 @@ def create_ui(app_config: ApplicationConfig):
 
    uiArticle = "Read the [documentation here](https://gitlab.com/aadnk/whisper-webui/-/blob/main/docs/options.md)."
 
-    whisper_models = app_config.get_model_names("whisper")
-    nllb_models = app_config.get_model_names("nllb")
-    m2m100_models = app_config.get_model_names("m2m100")
-    mt5_models = app_config.get_model_names("mt5")
-    ALMA_models = app_config.get_model_names("ALMA")
+    whisper_models = app_config.get_model_names("whisper")
+    nllb_models = app_config.get_model_names("nllb")
+    m2m100_models = app_config.get_model_names("m2m100")
+    mt5_models = app_config.get_model_names("mt5")
+    ALMA_models = app_config.get_model_names("ALMA")
     madlad400_models = app_config.get_model_names("madlad400")
-    seamless_models = app_config.get_model_names("seamless")
-    Llama_models = app_config.get_model_names("Llama")
+    seamless_models = app_config.get_model_names("seamless")
+    Llama_models = app_config.get_model_names("Llama")
     if not torch.cuda.is_available(): # Loading only quantized or models with medium-low parameters in an environment without GPU support.
-        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
-        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
-        ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
+        nllb_models = list(filter(lambda nllb: any(name in nllb for name in ["-600M", "-1.3B", "-3.3B-ct2"]), nllb_models))
+        m2m100_models = list(filter(lambda m2m100: "12B" not in m2m100, m2m100_models))
+        ALMA_models = list(filter(lambda alma: "GGUF" in alma or "ct2" in alma, ALMA_models))
         madlad400_models = list(filter(lambda madlad400: "ct2" in madlad400, madlad400_models))
 
     common_whisper_inputs = lambda : {
@@ -1356,9 +1358,9 @@ def create_ui(app_config: ApplicationConfig):
         return translation
 
     simpleTranscribe = create_transcribe(uiDescription, is_queue_mode)
-    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
-    fullTranscribe = create_transcribe(fullDescription, is_queue_mode, True)
-    uiTranslation = create_translation(is_queue_mode)
+    fullDescription = uiDescription + "\n\n\n\n" + "Be careful when changing some of the options in the full interface - this can cause the model to crash."
+    fullTranscribe = create_transcribe(fullDescription, is_queue_mode, True)
+    uiTranslation = create_translation(is_queue_mode)
 
     demo = gr.TabbedInterface([simpleTranscribe, fullTranscribe, uiTranslation], tab_names=["Simple", "Full", "Translation"], css=css)
 
@@ -1442,15 +1444,16 @@ if __name__ == '__main__':
 
     updated_config = default_app_config.update(**args)
 
-    updated_config.whisper_implementation = "faster-whisper"
-    updated_config.input_audio_max_duration = -1
-    updated_config.default_model_name = "large-v2"
-    updated_config.output_dir = "output"
-    updated_config.vad_max_merge_size = 90
-    updated_config.merge_subtitle_with_sources = False
-    updated_config.autolaunch = True
-    updated_config.auto_parallel = False
-    updated_config.save_downloaded_files = True
+    # updated_config.whisper_implementation = "faster-whisper"
+    # updated_config.input_audio_max_duration = -1
+    # updated_config.default_model_name = "large-v2"
+    # updated_config.output_dir = "output"
+    # updated_config.vad_max_merge_size = 90
+    # updated_config.merge_subtitle_with_sources = False
+    # updated_config.autolaunch = True
+    # updated_config.auto_parallel = False
+    # updated_config.save_downloaded_files = True
+    # updated_config.verbose = True
 
     try:
         if torch.cuda.is_available():
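A minimal sketch of what the new gate does, outside the full `WhisperTranscriber` method: it only injects `verbose` into the decode options when the application config asks for it. The `AppConfig` stand-in and `build_decode_options` helper below are hypothetical; only the `if self.app_config.verbose:` pattern comes from the diff.

```python
# Hypothetical stand-ins illustrating the verbose gate from the diff above.
class AppConfig:
    def __init__(self, verbose: bool = False):
        self.verbose = verbose

def build_decode_options(app_config: AppConfig, **decodeOptions) -> dict:
    # Only inject the key when verbose is enabled, so the downstream
    # Whisper call keeps its own default otherwise.
    if app_config.verbose:
        decodeOptions["verbose"] = True
    return decodeOptions

print(build_decode_options(AppConfig(verbose=True), beam_size=5))
# -> {'beam_size': 5, 'verbose': True}
```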
config.json5 CHANGED
@@ -367,7 +367,7 @@
   // Device to use for PyTorch inference, or Null to use the default device
   "device": null,
   // Whether to print out the progress and debug messages
-  "verbose": true,
+  "verbose": false,
   // Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')
   "task": "transcribe",
   // Language spoken in the audio, specify None to perform language detection
requirements-fasterWhisper.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 more_itertools
 zhconv
 sentencepiece
requirements-whisper.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 git+https://github.com/openai/whisper.git
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 altair
 zhconv
 sentencepiece
requirements.txt CHANGED
@@ -1,12 +1,18 @@
-transformers>=4.45.2
+--extra-index-url https://download.pytorch.org/whl/cu124
+
+transformers>=4.45.2
 ctranslate2>=4.4.0
 faster-whisper>=1.0.3
 ffmpeg-python==0.2.0
-gradio==5.9.1
+gradio==5.15.0
 yt-dlp
 json5
-torch
-torchaudio
+
+torch==2.5.0+cu124; sys_platform != 'darwin'
+torchaudio==2.5.0+cu124; sys_platform != 'darwin'
+torch==2.5.0; sys_platform == 'darwin'
+torchaudio==2.5.0; sys_platform == 'darwin'
+
 more_itertools
 zhconv
 sentencepiece
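With the pins above, pip should pull the cu124 wheels from the extra index on Linux/Windows and the plain 2.5.0 build on macOS. A quick post-install check (standard PyTorch attributes, nothing project-specific) shows which build was actually resolved:

```python
# Verify which torch build pip installed; the "+cu124" local version tag
# only appears when the CUDA wheel from the extra index was selected.
import torch

print(torch.__version__)          # e.g. "2.5.0+cu124" on Linux/Windows, "2.5.0" on macOS
print(torch.cuda.is_available())  # True only with a working CUDA device and driver
```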
src/config.py CHANGED
@@ -58,7 +58,7 @@ class ApplicationConfig:
                  vad_parallel_devices: str = "", vad_cpu_cores: int = 1, vad_process_timeout: int = 1800,
                  auto_parallel: bool = False, output_dir: str = None,
                  model_dir: str = None, device: str = None,
-                 verbose: bool = True, task: str = "transcribe", language: str = None,
+                 verbose: bool = False, task: str = "transcribe", language: str = None,
                  vad_initial_prompt_mode: str = "prepend_first_segment ",
                  vad_merge_window: float = 5, vad_max_merge_size: float = 30,
                  vad_padding: float = 1, vad_prompt_window: float = 3,
src/vad.py CHANGED
@@ -14,12 +14,12 @@ from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache
 from src.segments import merge_timestamps
 from src.whisper.abstractWhisperContainer import AbstractWhisperCallback
 
-# Workaround for https://github.com/tensorflow/tensorflow/issues/48797
-try:
-    import tensorflow as tf
-except ModuleNotFoundError:
-    # Error handling
-    pass
+# # Workaround for https://github.com/tensorflow/tensorflow/issues/48797
+# try:
+#     import tensorflow as tf
+# except ModuleNotFoundError:
+#     # Error handling
+#     pass
 
 import torch
 
@@ -621,7 +621,7 @@ class VadSileroTranscription(AbstractTranscription):
             chunk_start += chunk_duration
 
         perf_end_time = time.perf_counter()
-        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+        print(f"VAD processing took {perf_end_time - perf_start_time} seconds, from {start_time} to {end_time}")
 
         return result
 
src/vadParallel.py CHANGED
@@ -108,6 +108,24 @@ class ParallelTranscription(AbstractTranscription):
     def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: AbstractWhisperCallback, config: TranscriptionConfig,
                             cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None,
                             progress_listener: ProgressListener = None):
+
+        """
+        Perform parallel transcription of an audio file using CPU and GPU.
+
+        Args:
+            transcription (AbstractTranscription): The transcription instance handling processing.
+            audio (str): Path to the audio file to be transcribed.
+            whisperCallable (AbstractWhisperCallback): Callback to interact with the Whisper model.
+            config (TranscriptionConfig): Configuration for transcription settings.
+            cpu_device_count (int): Number of CPU devices to use for processing.
+            gpu_devices (List[str]): List of GPU device IDs to use for processing.
+            cpu_parallel_context (ParallelContext, optional): Context for managing CPU parallel execution.
+            gpu_parallel_context (ParallelContext, optional): Context for managing GPU parallel execution.
+            progress_listener (ProgressListener, optional): Listener for tracking transcription progress.
+
+        Returns:
+            dict: Merged transcription results containing text, segments, and detected language.
+        """
         total_duration = get_audio_duration(audio)
 
         # First, get the timestamps for the original audio
@@ -212,6 +230,20 @@ class ParallelTranscription(AbstractTranscription):
 
     def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
                                         cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
+        """
+        Compute merged timestamps for transcription in parallel using CPU.
+
+        Args:
+            transcription (AbstractTranscription): The transcription instance handling timestamp processing.
+            audio (str): Path to the audio file.
+            config (TranscriptionConfig): Configuration settings for timestamp processing.
+            total_duration (float): Total duration of the audio file in seconds.
+            cpu_device_count (int): Number of CPU devices to use.
+            cpu_parallel_context (ParallelContext, optional): Context for managing CPU parallel execution.
+
+        Returns:
+            list: Merged timestamps after processing.
+        """
         parameters = []
 
         chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
@@ -228,8 +260,7 @@
                 # No need to process chunks that are less than 1 second
                 break
 
-            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
-                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
+            print(f"Parallel VAD: Executing chunk from {chunk_start} to {chunk_end} on CPU device {cpu_device_id}")
             parameters.append([audio, config, chunk_start, chunk_end]);
 
             cpu_device_id += 1
@@ -258,7 +289,7 @@
             merged = transcription.get_merged_timestamps(timestamps, config, total_duration)
 
             perf_end_time = time.perf_counter()
-            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+            print(f"Parallel VAD processing took {perf_end_time - perf_start_time} seconds")
             return merged
 
         finally:
@@ -273,6 +304,17 @@
         return []
 
     def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
+        """
+        Merge timestamps from different transcription segments.
+
+        Args:
+            timestamps (List[Dict[str, Any]]): List of timestamp dictionaries from different segments.
+            config (ParallelTranscriptionConfig): Configuration settings for merging timestamps.
+            total_duration (float): Total duration of the audio file in seconds.
+
+        Returns:
+            list: Merged timestamps after processing.
+        """
         # Override timestamps that will be processed
         if (config.override_timestamps is not None):
             print("(get_merged_timestamps) Using override timestamps of size " + str(len(config.override_timestamps)))
@@ -281,6 +323,18 @@
 
     def transcribe(self, audio: str, whisperCallable: AbstractWhisperCallback, config: ParallelTranscriptionConfig,
                    progressListener: ProgressListener = None):
+        """
+        Perform transcription on a given audio file using the specified device.
+
+        Args:
+            audio (str): Path to the audio file to be transcribed.
+            whisperCallable (AbstractWhisperCallback): Callback to interact with the Whisper model.
+            config (ParallelTranscriptionConfig): Configuration settings for transcription.
+            progressListener (ProgressListener, optional): Listener for tracking transcription progress.
+
+        Returns:
+            dict: Transcription results including text, segments, and detected language.
+        """
         # Override device ID the first time
         if (os.environ.get("INITIALIZED", None) is None):
             os.environ["INITIALIZED"] = "1"
@@ -294,7 +348,15 @@
         return super().transcribe(audio, whisperCallable, config, progressListener)
 
     def _split(self, a, n):
-        """Split a list into n approximately equal parts."""
+        """Split a list into n approximately equal parts.
+
+        Args:
+            a (List[Any]): The list to be split.
+            n (int): The number of parts to split the list into.
+
+        Returns:
+            generator: A generator yielding n sublists.
+        """
         k, m = divmod(len(a), n)
         return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
 
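Since `_split` appears in full in the hunk above, here is a standalone copy with a usage check (same expression, just lifted out of the class):

```python
# Standalone copy of _split from the diff: splits a list into n
# approximately equal, order-preserving parts.
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n))

print(list(split(list(range(10)), 3)))
# -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```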