Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Sep 19, 2024

Commit

602c60a

2 Parent(s): 47a36e3 633c360

Merge from master

Browse files

Files changed (16) hide show

app.py +63 -10
configs/default_parameters.yaml +6 -0
docker-compose.yaml +0 -2
modules/ui/htmls.py +1 -1
modules/utils/cli_manager.py +12 -0
modules/utils/files_manager.py +7 -1
modules/utils/paths.py +8 -1
modules/uvr/music_separator.py +183 -0
modules/whisper/faster_whisper_inference.py +3 -1
modules/whisper/insanely_fast_whisper_inference.py +4 -2
modules/whisper/whisper_Inference.py +4 -2
modules/whisper/whisper_base.py +72 -3
modules/whisper/whisper_factory.py +12 -5
modules/whisper/whisper_parameter.py +48 -17
notebook/whisper-webui.ipynb +3 -2
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -4,13 +4,15 @@ import gradio as gr
 import yaml
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
 from modules.ui.htmls import *
 from modules.utils.youtube_manager import get_ytmetas
 from modules.translation.deepl_api import DeepLAPI
 from modules.whisper.whisper_parameter import *
@@ -25,10 +27,9 @@ class App:
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
             output_dir=self.args.output_dir,
         )
-        print(f"Use \"{self.args.whisper_type}\" implementation")
-        print(f"Device \"{self.whisper_inf.device}\" is detected")
         self.nllb_inf = NLLBInference(
             model_dir=self.args.nllb_model_dir,
             output_dir=os.path.join(self.args.output_dir, "translations")
@@ -37,11 +38,14 @@ class App:
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
     def create_whisper_parameters(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
@@ -127,6 +131,16 @@ class App:
                                               precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
         with gr.Accordion("VAD", open=False):
             cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
                                         interactive=True)
@@ -173,7 +187,9 @@ class App:
                 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                 language_detection_threshold=nb_language_detection_threshold,
                 language_detection_segments=nb_language_detection_segments,
-                prompt_reset_on_temperature=sld_prompt_reset_on_temperature
             ),
             dd_file_format,
             cb_timestamp
@@ -183,6 +199,7 @@ class App:
         translation_params = self.default_params["translation"]
         deepl_params = translation_params["deepl"]
         nllb_params = translation_params["nllb"]
         with self.app:
             with gr.Row():
@@ -254,7 +271,7 @@ class App:
                         files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                         btn_openfolder = gr.Button('📂', scale=1)
-                    params = [mic_input, dd_file_format]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),
@@ -328,6 +345,39 @@ class App:
                                          inputs=None,
                                          outputs=None)
         # Launch the app with optional gradio settings
         args = self.args
@@ -347,7 +397,8 @@ class App:
         if os.path.exists(folder_path):
             os.system(f"start {folder_path}")
         else:
-            print(f"The folder {folder_path} does not exist.")
     @staticmethod
     def on_change_models(model_size: str):
@@ -362,16 +413,16 @@ class App:
 parser = argparse.ArgumentParser()
 parser.add_argument('--whisper_type', type=str, default="faster-whisper",
                     help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
-parser.add_argument('--share', type=bool, default=False, nargs='?', const=True, help='Gradio share value')
 parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
 parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
 parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
 parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
 parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
-parser.add_argument('--colab', type=bool, default=False, nargs='?', const=True, help='Is colab user or not')
-parser.add_argument('--api_open', type=bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
-parser.add_argument('--inbrowser', type=bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                     help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
@@ -383,6 +434,8 @@ parser.add_argument('--diarization_model_dir', type=str, default=DIARIZATION_MOD
                     help='Directory path of the diarization model')
 parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                     help='Directory path of the Facebook NLLB model')
 parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
 _args = parser.parse_args()

 import yaml
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, WHISPER_MODELS_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, NLLB_MODELS_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.files_manager import load_yaml
 from modules.whisper.whisper_factory import WhisperFactory
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
 from modules.translation.nllb_inference import NLLBInference
 from modules.ui.htmls import *
+from modules.utils.cli_manager import str2bool
 from modules.utils.youtube_manager import get_ytmetas
 from modules.translation.deepl_api import DeepLAPI
 from modules.whisper.whisper_parameter import *
             whisper_model_dir=self.args.whisper_model_dir,
             faster_whisper_model_dir=self.args.faster_whisper_model_dir,
             insanely_fast_whisper_model_dir=self.args.insanely_fast_whisper_model_dir,
+            uvr_model_dir=self.args.uvr_model_dir,
             output_dir=self.args.output_dir,
         )
         self.nllb_inf = NLLBInference(
             model_dir=self.args.nllb_model_dir,
             output_dir=os.path.join(self.args.output_dir, "translations")
             output_dir=os.path.join(self.args.output_dir, "translations")
         )
         self.default_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        print(f"Use \"{self.args.whisper_type}\" implementation")
+        print(f"Device \"{self.whisper_inf.device}\" is detected")
     def create_whisper_parameters(self):
         whisper_params = self.default_params["whisper"]
         vad_params = self.default_params["vad"]
         diarization_params = self.default_params["diarization"]
+        uvr_params = self.default_params["bgm_separation"]
         with gr.Row():
             dd_model = gr.Dropdown(choices=self.whisper_inf.available_models, value=whisper_params["model_size"],
                                               precision=0)
                 nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
+        with gr.Accordion("BGM Separation", open=False):
+            cb_bgm_separation = gr.Checkbox(label="Enable BGM Separation Filter", value=uvr_params["is_separate_bgm"],
+                                            interactive=True)
+            dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                        choices=self.whisper_inf.music_separator.available_devices)
+            dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                            choices=self.whisper_inf.music_separator.available_models)
+            nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+            cb_uvr_save_file = gr.Checkbox(label="Save separated files to output", value=uvr_params["save_file"])
         with gr.Accordion("VAD", open=False):
             cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=vad_params["vad_filter"],
                                         interactive=True)
                 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
                 language_detection_threshold=nb_language_detection_threshold,
                 language_detection_segments=nb_language_detection_segments,
+                prompt_reset_on_temperature=sld_prompt_reset_on_temperature, is_bgm_separate=cb_bgm_separation,
+                uvr_device=dd_uvr_device, uvr_model_size=dd_uvr_model_size, uvr_segment_size=nb_uvr_segment_size,
+                uvr_save_file=cb_uvr_save_file
             ),
             dd_file_format,
             cb_timestamp
         translation_params = self.default_params["translation"]
         deepl_params = translation_params["deepl"]
         nllb_params = translation_params["nllb"]
+        uvr_params = self.default_params["bgm_separation"]
         with self.app:
             with gr.Row():
                         files_subtitles = gr.Files(label="Downloadable output file", scale=3)
                         btn_openfolder = gr.Button('📂', scale=1)
+                    params = [mic_input, dd_file_format, cb_timestamp]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + whisper_params.as_list(),
                                          inputs=None,
                                          outputs=None)
+                with gr.TabItem("BGM Separation"):
+                    files_audio = gr.Files(type="filepath", label="Upload Audio Files to separate background music")
+                    dd_uvr_device = gr.Dropdown(label="Device", value=self.whisper_inf.music_separator.device,
+                                                choices=self.whisper_inf.music_separator.available_devices)
+                    dd_uvr_model_size = gr.Dropdown(label="Model", value=uvr_params["model_size"],
+                                                    choices=self.whisper_inf.music_separator.available_models)
+                    nb_uvr_segment_size = gr.Number(label="Segment Size", value=uvr_params["segment_size"], precision=0)
+                    cb_uvr_save_file = gr.Checkbox(label="Save separated files to output",
+                                                   value=True, visible=False)
+                    btn_run = gr.Button("SEPARATE BACKGROUND MUSIC", variant="primary")
+                    with gr.Column():
+                        with gr.Row():
+                            ad_instrumental = gr.Audio(label="Instrumental", scale=8)
+                            btn_open_instrumental_folder = gr.Button('📂', scale=1)
+                        with gr.Row():
+                            ad_vocals = gr.Audio(label="Vocals", scale=8)
+                            btn_open_vocals_folder = gr.Button('📂', scale=1)
+                    btn_run.click(fn=self.whisper_inf.music_separator.separate_files,
+                                  inputs=[files_audio, dd_uvr_model_size, dd_uvr_device, nb_uvr_segment_size,
+                                          cb_uvr_save_file],
+                                  outputs=[ad_instrumental, ad_vocals])
+                    btn_open_instrumental_folder.click(inputs=None,
+                                                       outputs=None,
+                                                       fn=lambda: self.open_folder(os.path.join(
+                                                           self.args.output_dir, "UVR", "instrumental"
+                                                       )))
+                    btn_open_vocals_folder.click(inputs=None,
+                                                 outputs=None,
+                                                 fn=lambda: self.open_folder(os.path.join(
+                                                    self.args.output_dir, "UVR", "vocals"
+                                                 )))
         # Launch the app with optional gradio settings
         args = self.args
         if os.path.exists(folder_path):
             os.system(f"start {folder_path}")
         else:
+            os.makedirs(folder_path, exist_ok=True)
+            print(f"The directory path {folder_path} has newly created.")
     @staticmethod
     def on_change_models(model_size: str):
 parser = argparse.ArgumentParser()
 parser.add_argument('--whisper_type', type=str, default="faster-whisper",
                     help='A type of the whisper implementation between: ["whisper", "faster-whisper", "insanely-fast-whisper"]')
+parser.add_argument('--share', type=str2bool, default=False, nargs='?', const=True, help='Gradio share value')
 parser.add_argument('--server_name', type=str, default=None, help='Gradio server host')
 parser.add_argument('--server_port', type=int, default=None, help='Gradio server port')
 parser.add_argument('--root_path', type=str, default=None, help='Gradio root path')
 parser.add_argument('--username', type=str, default=None, help='Gradio authentication username')
 parser.add_argument('--password', type=str, default=None, help='Gradio authentication password')
 parser.add_argument('--theme', type=str, default=None, help='Gradio Blocks theme')
+parser.add_argument('--colab', type=str2bool, default=False, nargs='?', const=True, help='Is colab user or not')
+parser.add_argument('--api_open', type=str2bool, default=False, nargs='?', const=True, help='Enable api or not in Gradio')
+parser.add_argument('--inbrowser', type=str2bool, default=True, nargs='?', const=True, help='Whether to automatically start Gradio app or not')
 parser.add_argument('--whisper_model_dir', type=str, default=WHISPER_MODELS_DIR,
                     help='Directory path of the whisper model')
 parser.add_argument('--faster_whisper_model_dir', type=str, default=FASTER_WHISPER_MODELS_DIR,
                     help='Directory path of the diarization model')
 parser.add_argument('--nllb_model_dir', type=str, default=NLLB_MODELS_DIR,
                     help='Directory path of the Facebook NLLB model')
+parser.add_argument('--uvr_model_dir', type=str, default=UVR_MODELS_DIR,
+                    help='Directory path of the UVR model')
 parser.add_argument('--output_dir', type=str, default=OUTPUT_DIR, help='Directory path of the outputs')
 _args = parser.parse_args()

configs/default_parameters.yaml CHANGED Viewed

@@ -44,6 +44,12 @@ diarization:
   is_diarize: false
   hf_token: ""
 translation:
   deepl:
     api_key: ""

   is_diarize: false
   hf_token: ""
+bgm_separation:
+  is_separate_bgm: false
+  model_size: "UVR-MDX-NET-Inst_HQ_4"
+  segment_size: 256
+  save_file: false
 translation:
   deepl:
     api_key: ""

docker-compose.yaml CHANGED Viewed

@@ -1,5 +1,3 @@
-version: '3.8'
 services:
   app:
     build: .

 services:
   app:
     build: .

modules/ui/htmls.py CHANGED Viewed

@@ -38,7 +38,7 @@ CSS = """
 """
 MARKDOWN = """
-### [Whisper Web-UI](https://github.com/jhj0517/Whsiper-WebUI)
 """

 """
 MARKDOWN = """
+### [Whisper-WebUI](https://github.com/jhj0517/Whsiper-WebUI)
 """

modules/utils/cli_manager.py ADDED Viewed

	@@ -0,0 +1,12 @@

+import argparse
+def str2bool(v):
+    """Convert a command-line token into a bool, for use as an argparse ``type=`` callable.
+    Args:
+        v: Either an actual bool (returned unchanged) or a string token.
+    Returns:
+        bool: True for 'yes', 'true', 't', 'y', '1'; False for 'no', 'false', 'f', 'n', '0'
+        (matched case-insensitively).
+    Raises:
+        argparse.ArgumentTypeError: If the token is none of the recognised spellings.
+    """
+    # argparse hands over the `const`/`default` objects as real booleans; pass them through.
+    if isinstance(v, bool):
+        return v
+    token = v.lower()
+    if token in ('yes', 'true', 't', 'y', '1'):
+        return True
+    if token in ('no', 'false', 'f', 'n', '0'):
+        return False
+    raise argparse.ArgumentTypeError('Boolean value expected.')

modules/utils/files_manager.py CHANGED Viewed

@@ -29,7 +29,8 @@ def save_yaml(data: dict, path: str = DEFAULT_PARAMETERS_CONFIG_PATH):
 def get_media_files(folder_path, include_sub_directory=False):
-    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv']
     audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
     media_extensions = video_extensions + audio_extensions
@@ -61,3 +62,8 @@ def format_gradio_files(files: list):
         gradio_files.append(NamedString(file))
     return gradio_files

 def get_media_files(folder_path, include_sub_directory=False):
+    video_extensions = ['*.mp4', '*.mkv', '*.flv', '*.avi', '*.mov', '*.wmv', '*.webm', '*.m4v', '*.mpeg', '*.mpg',
+                        '*.3gp', '*.f4v', '*.ogv', '*.vob', '*.mts', '*.m2ts', '*.divx', '*.mxf', '*.rm', '*.rmvb']
     audio_extensions = ['*.mp3', '*.wav', '*.aac', '*.flac', '*.ogg', '*.m4a']
     media_extensions = video_extensions + audio_extensions
         gradio_files.append(NamedString(file))
     return gradio_files
+def is_video(file_path):
+    """Return True when *file_path* ends with a known video extension (case-insensitive).
+    The extension set mirrors the video patterns used by get_media_files() in this module, so
+    every file that is collected as a video there is also classified as a video here.
+    Args:
+        file_path: Path (or bare file name) to inspect; only its extension is examined.
+    Returns:
+        bool: True if the extension is a video extension, False otherwise (including no extension).
+    """
+    video_extensions = ('.mp4', '.mkv', '.flv', '.avi', '.mov', '.wmv', '.webm', '.m4v', '.mpeg', '.mpg',
+                        '.3gp', '.f4v', '.ogv', '.vob', '.mts', '.m2ts', '.divx', '.mxf', '.rm', '.rmvb')
+    extension = os.path.splitext(file_path)[1].lower()
+    return extension in video_extensions

modules/utils/paths.py CHANGED Viewed

@@ -7,10 +7,14 @@ FASTER_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "faster-whisper")
 INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
 NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
 DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
 for dir_path in [MODELS_DIR,
                  WHISPER_MODELS_DIR,
@@ -18,7 +22,10 @@ for dir_path in [MODELS_DIR,
                  INSANELY_FAST_WHISPER_MODELS_DIR,
                  NLLB_MODELS_DIR,
                  DIARIZATION_MODELS_DIR,
                  CONFIGS_DIR,
                  OUTPUT_DIR,
-                 TRANSLATION_OUTPUT_DIR]:
     os.makedirs(dir_path, exist_ok=True)

 INSANELY_FAST_WHISPER_MODELS_DIR = os.path.join(WHISPER_MODELS_DIR, "insanely-fast-whisper")
 NLLB_MODELS_DIR = os.path.join(MODELS_DIR, "NLLB")
 DIARIZATION_MODELS_DIR = os.path.join(MODELS_DIR, "Diarization")
+UVR_MODELS_DIR = os.path.join(MODELS_DIR, "UVR", "MDX_Net_Models")
 CONFIGS_DIR = os.path.join(WEBUI_DIR, "configs")
 DEFAULT_PARAMETERS_CONFIG_PATH = os.path.join(CONFIGS_DIR, "default_parameters.yaml")
 OUTPUT_DIR = os.path.join(WEBUI_DIR, "outputs")
 TRANSLATION_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "translations")
+UVR_OUTPUT_DIR = os.path.join(OUTPUT_DIR, "UVR")
+UVR_INSTRUMENTAL_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "instrumental")
+UVR_VOCALS_OUTPUT_DIR = os.path.join(UVR_OUTPUT_DIR, "vocals")
 for dir_path in [MODELS_DIR,
                  WHISPER_MODELS_DIR,
                  INSANELY_FAST_WHISPER_MODELS_DIR,
                  NLLB_MODELS_DIR,
                  DIARIZATION_MODELS_DIR,
+                 UVR_MODELS_DIR,
                  CONFIGS_DIR,
                  OUTPUT_DIR,
+                 TRANSLATION_OUTPUT_DIR,
+                 UVR_INSTRUMENTAL_OUTPUT_DIR,
+                 UVR_VOCALS_OUTPUT_DIR]:
     os.makedirs(dir_path, exist_ok=True)

modules/uvr/music_separator.py ADDED Viewed

	@@ -0,0 +1,183 @@

+from typing import Optional, Union, List, Dict
+import numpy as np
+import torchaudio
+import soundfile as sf
+import os
+import torch
+import gc
+import gradio as gr
+from datetime import datetime
+from uvr.models import MDX, Demucs, VrNetwork, MDXC
+from modules.utils.paths import DEFAULT_PARAMETERS_CONFIG_PATH
+from modules.utils.files_manager import load_yaml, save_yaml, is_video
+from modules.diarize.audio_loader import load_audio
+class MusicSeparator:
+    """Split audio into instrumental and vocal stems using a UVR MDX model.
+    The loaded model is cached on the instance and rebuilt only when the requested model name,
+    device, segment size or sample rate differs from what is currently loaded.
+    """
+    def __init__(self,
+                 model_dir: Optional[str] = None,
+                 output_dir: Optional[str] = None):
+        """
+        Args:
+            model_dir (Optional[str]): Directory handed to the MDX model loader.
+            output_dir (Optional[str]): Directory under which the "instrumental" and "vocals"
+                sub-directories are created and separated files are written.
+        """
+        self.model = None
+        self.device = self.get_device()
+        self.available_devices = ["cpu", "cuda"]
+        self.model_dir = model_dir
+        self.output_dir = output_dir
+        # NOTE(review): output_dir defaults to None, but os.path.join() raises TypeError for None;
+        # callers are presumably expected to always pass a real path - confirm.
+        instrumental_output_dir = os.path.join(self.output_dir, "instrumental")
+        vocals_output_dir = os.path.join(self.output_dir, "vocals")
+        os.makedirs(instrumental_output_dir, exist_ok=True)
+        os.makedirs(vocals_output_dir, exist_ok=True)
+        self.audio_info = None
+        self.available_models = ["UVR-MDX-NET-Inst_HQ_4", "UVR-MDX-NET-Inst_3"]
+        self.default_model = self.available_models[0]
+        # Name of the model the cache check in separate() compares against.
+        self.current_model_size = self.default_model
+        self.model_config = {
+            "segment": 256,
+            "split": True
+        }
+    def update_model(self,
+                     model_name: str = "UVR-MDX-NET-Inst_1",
+                     device: Optional[str] = None,
+                     segment_size: int = 256):
+        """
+        Load the given model and remember its name, device and configuration.
+        Args:
+            model_name (str): Model name.
+            device (str): Device to use for the model. Falls back to the current device when None.
+            segment_size (int): Segment size for the prediction.
+        """
+        # NOTE(review): the default model_name is not listed in self.available_models - confirm it is intended.
+        if device is None:
+            device = self.device
+        self.device = device
+        self.model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+        self.model = MDX(name=model_name,
+                         other_metadata=self.model_config,
+                         device=self.device,
+                         logger=None,
+                         model_dir=self.model_dir)
+        # Record what is loaded; otherwise separate() sees a name mismatch for every
+        # non-default model and reloads it on each call.
+        self.current_model_size = model_name
+    def separate(self,
+                 audio: Union[str, np.ndarray],
+                 model_name: str,
+                 device: Optional[str] = None,
+                 segment_size: int = 256,
+                 save_file: bool = False,
+                 progress: gr.Progress = gr.Progress()) -> tuple[np.ndarray, np.ndarray, List]:
+        """
+        Separate the background music from the audio.
+        Args:
+            audio (Union[str, np.ndarray]): Audio path or numpy array.
+            model_name (str): Model name.
+            device (str): Device to use for the model. Falls back to the current device when None.
+            segment_size (int): Segment size for the prediction.
+            save_file (bool): Whether to save the separated audio to output path or not.
+            progress (gr.Progress): Gradio progress indicator.
+        Returns:
+            A Tuple of
+            np.ndarray: Instrumental numpy arrays.
+            np.ndarray: Vocals numpy arrays.
+            file_paths: List of file paths where the separated audio is saved. Return empty when save_file is False.
+        """
+        # Resolve the device up front so that an omitted device does not look like a
+        # device change in the cache check below.
+        if device is None:
+            device = self.device
+        ext = ".wav"
+        if isinstance(audio, str):
+            output_filename = os.path.splitext(os.path.basename(audio))[0]
+            if is_video(audio):
+                # presumably load_audio() decodes to 16 kHz, matching the rate set here - TODO confirm
+                audio = load_audio(audio)
+                sample_rate = 16000
+            else:
+                self.audio_info = torchaudio.info(audio)
+                sample_rate = self.audio_info.sample_rate
+        else:
+            # In-memory audio has no file name; build one from the current time.
+            timestamp = datetime.now().strftime("%m%d%H%M%S")
+            output_filename = f"UVR-{timestamp}"
+            sample_rate = 16000
+        model_config = {
+            "segment": segment_size,
+            "split": True
+        }
+        if (self.model is None or
+                self.current_model_size != model_name or
+                self.model_config != model_config or
+                self.model.sample_rate != sample_rate or
+                self.device != device):
+            progress(0, desc="Initializing UVR Model..")
+            self.update_model(
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size
+            )
+            self.model.sample_rate = sample_rate
+        progress(0, desc="Separating background music from the audio..")
+        result = self.model(audio)
+        instrumental, vocals = result["instrumental"].T, result["vocals"].T
+        file_paths = []
+        if save_file:
+            instrumental_output_path = os.path.join(self.output_dir, "instrumental", f"{output_filename}-instrumental{ext}")
+            vocals_output_path = os.path.join(self.output_dir, "vocals", f"{output_filename}-vocals{ext}")
+            sf.write(instrumental_output_path, instrumental, sample_rate, format="WAV")
+            sf.write(vocals_output_path, vocals, sample_rate, format="WAV")
+            file_paths += [instrumental_output_path, vocals_output_path]
+        return instrumental, vocals, file_paths
+    def separate_files(self,
+                       files: List,
+                       model_name: str,
+                       device: Optional[str] = None,
+                       segment_size: int = 256,
+                       save_file: bool = True,
+                       progress: gr.Progress = gr.Progress()) -> List[str]:
+        """Separate the background music from the audio files. Returns only last Instrumental and vocals file paths
+        to display into gr.Audio(). Returns an empty list when no files are given."""
+        self.cache_parameters(model_size=model_name, segment_size=segment_size)
+        # Start from an empty result so an empty or missing upload does not leave the name unbound.
+        file_paths = []
+        for file_path in files or []:
+            _, _, file_paths = self.separate(
+                audio=file_path,
+                model_name=model_name,
+                device=device,
+                segment_size=segment_size,
+                save_file=save_file,
+                progress=progress
+            )
+        return file_paths
+    @staticmethod
+    def get_device():
+        """Get device for the model: "cuda" when torch reports CUDA as available, otherwise "cpu"."""
+        return "cuda" if torch.cuda.is_available() else "cpu"
+    def offload(self):
+        """Offload the model and free up the memory"""
+        if self.model is not None:
+            del self.model
+            self.model = None
+        if self.device == "cuda":
+            torch.cuda.empty_cache()
+        gc.collect()
+        self.audio_info = None
+    @staticmethod
+    def cache_parameters(model_size: str,
+                         segment_size: int):
+        """Persist the chosen model and segment size into the "bgm_separation" section of the
+        default-parameters YAML, keeping the other keys of that section.
+        Args:
+            model_size (str): Model name to store under "model_size".
+            segment_size (int): Segment size to store under "segment_size".
+        """
+        cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
+        # Tolerate config files that have no "bgm_separation" section yet.
+        cached_uvr_params = cached_params.get("bgm_separation") or {}
+        uvr_params_to_cache = {
+            "model_size": model_size,
+            "segment_size": segment_size
+        }
+        cached_params["bgm_separation"] = {**cached_uvr_params, **uvr_params_to_cache}
+        save_yaml(cached_params, DEFAULT_PARAMETERS_CONFIG_PATH)

modules/whisper/faster_whisper_inference.py CHANGED Viewed

@@ -11,7 +11,7 @@ import whisper
 import gradio as gr
 from argparse import Namespace
-from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
@@ -20,11 +20,13 @@ class FasterWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir

 import gradio as gr
 from argparse import Namespace
+from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
     def __init__(self,
                  model_dir: str = FASTER_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir,
             output_dir=output_dir
         )
         self.model_dir = model_dir

modules/whisper/insanely_fast_whisper_inference.py CHANGED Viewed

@@ -11,7 +11,7 @@ import whisper
 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
-from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
@@ -20,12 +20,14 @@ class InsanelyFastWhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)

 from rich.progress import Progress, TimeElapsedColumn, BarColumn, TextColumn
 from argparse import Namespace
+from modules.utils.paths import (INSANELY_FAST_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, UVR_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_parameter import *
 from modules.whisper.whisper_base import WhisperBase
     def __init__(self,
                  model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
         self.model_dir = model_dir
         os.makedirs(self.model_dir, exist_ok=True)

modules/whisper/whisper_Inference.py CHANGED Viewed

@@ -7,7 +7,7 @@ import torch
 import os
 from argparse import Namespace
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
@@ -16,12 +16,14 @@ class WhisperInference(WhisperBase):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
-            diarization_model_dir=diarization_model_dir
         )
     def transcribe(self,

 import os
 from argparse import Namespace
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, UVR_MODELS_DIR)
 from modules.whisper.whisper_base import WhisperBase
 from modules.whisper.whisper_parameter import *
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         super().__init__(
             model_dir=model_dir,
             output_dir=output_dir,
+            diarization_model_dir=diarization_model_dir,
+            uvr_model_dir=uvr_model_dir
         )
     def transcribe(self,

modules/whisper/whisper_base.py CHANGED Viewed

@@ -2,6 +2,7 @@ import os
 import torch
 import whisper
 import gradio as gr
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
@@ -9,7 +10,9 @@ from datetime import datetime
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
-from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
@@ -22,6 +25,7 @@ class WhisperBase(ABC):
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
@@ -32,6 +36,10 @@ class WhisperBase(ABC):
             model_dir=diarization_model_dir
         )
         self.vad = SileroVAD()
         self.model = None
         self.current_model_size = None
@@ -102,7 +110,26 @@ class WhisperBase(ABC):
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
-        speech_chunks = None
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
             if params.max_speech_duration_s >= 9999:
@@ -224,6 +251,7 @@ class WhisperBase(ABC):
     def transcribe_mic(self,
                        mic_audio: str,
                        file_format: str,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
@@ -236,6 +264,8 @@ class WhisperBase(ABC):
             Audio file path from gr.Microphone()
         file_format: str
             Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
@@ -253,6 +283,7 @@ class WhisperBase(ABC):
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
                 *whisper_params,
             )
             progress(1, desc="Completed!")
@@ -260,7 +291,7 @@ class WhisperBase(ABC):
             subtitle, result_file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=transcribed_segments,
-                add_timestamp=True,
                 file_format=file_format,
                 output_dir=self.output_dir
             )
@@ -427,18 +458,40 @@ class WhisperBase(ABC):
         if torch.cuda.is_available():
             return "cuda"
         elif torch.backends.mps.is_available():
             return "mps"
         else:
             return "cpu"
     @staticmethod
     def release_cuda_memory():
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.reset_max_memory_allocated()
     @staticmethod
     def remove_input_files(file_paths: List[str]):
         if not file_paths:
             return
@@ -451,9 +504,25 @@ class WhisperBase(ABC):
         whisper_params: WhisperValues,
         add_timestamp: bool
     ):
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_whisper_param = whisper_params.to_yaml()
         cached_yaml = {**cached_params, **cached_whisper_param}
         cached_yaml["whisper"]["add_timestamp"] = add_timestamp
         save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)

 import torch
 import whisper
 import gradio as gr
+import torchaudio
 from abc import ABC, abstractmethod
 from typing import BinaryIO, Union, Tuple, List
 import numpy as np
 from faster_whisper.vad import VadOptions
 from dataclasses import astuple
+from modules.uvr.music_separator import MusicSeparator
+from modules.utils.paths import (WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR, DEFAULT_PARAMETERS_CONFIG_PATH,
+                                 UVR_MODELS_DIR)
 from modules.utils.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.utils.youtube_manager import get_ytdata, get_ytaudio
 from modules.utils.files_manager import get_media_files, format_gradio_files, load_yaml, save_yaml
     def __init__(self,
                  model_dir: str = WHISPER_MODELS_DIR,
                  diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+                 uvr_model_dir: str = UVR_MODELS_DIR,
                  output_dir: str = OUTPUT_DIR,
                  ):
         self.model_dir = model_dir
             model_dir=diarization_model_dir
         )
         self.vad = SileroVAD()
+        self.music_separator = MusicSeparator(
+            model_dir=uvr_model_dir,
+            output_dir=os.path.join(output_dir, "UVR")
+        )
         self.model = None
         self.current_model_size = None
             language_code_dict = {value: key for key, value in whisper.tokenizer.LANGUAGES.items()}
             params.lang = language_code_dict[params.lang]
+        if params.is_bgm_separate:
+            music, audio, _ = self.music_separator.separate(
+                audio=audio,
+                model_name=params.uvr_model_size,
+                device=params.uvr_device,
+                segment_size=params.uvr_segment_size,
+                save_file=params.uvr_save_file,
+                progress=progress
+            )
+            if audio.ndim >= 2:
+                audio = audio.mean(axis=1)
+                if self.music_separator.audio_info is None:
+                    origin_sample_rate = 16000
+                else:
+                    origin_sample_rate = self.music_separator.audio_info.sample_rate
+                audio = self.resample_audio(audio=audio, original_sample_rate=origin_sample_rate)
+            self.music_separator.offload()
         if params.vad_filter:
             # Explicit value set for float('inf') from gr.Number()
             if params.max_speech_duration_s >= 9999:
     def transcribe_mic(self,
                        mic_audio: str,
                        file_format: str,
+                       add_timestamp: bool,
                        progress=gr.Progress(),
                        *whisper_params,
                        ) -> list:
             Audio file path from gr.Microphone()
         file_format: str
             Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
+        add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         *whisper_params: tuple
             transcribed_segments, time_for_task = self.run(
                 mic_audio,
                 progress,
+                add_timestamp,
                 *whisper_params,
             )
             progress(1, desc="Completed!")
             subtitle, result_file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=transcribed_segments,
+                add_timestamp=add_timestamp,
                 file_format=file_format,
                 output_dir=self.output_dir
             )
         if torch.cuda.is_available():
             return "cuda"
         elif torch.backends.mps.is_available():
+            if not WhisperBase.is_sparse_api_supported():
+                # Device `SparseMPS` is not supported for now. See : https://github.com/pytorch/pytorch/issues/87886
+                return "cpu"
             return "mps"
         else:
             return "cpu"
+    @staticmethod
+    def is_sparse_api_supported():
+        if not torch.backends.mps.is_available():
+            return False
+        try:
+            device = torch.device("mps")
+            sparse_tensor = torch.sparse_coo_tensor(
+                indices=torch.tensor([[0, 1], [2, 3]]),
+                values=torch.tensor([1, 2]),
+                size=(4, 4),
+                device=device
+            )
+            return True
+        except RuntimeError:
+            return False
     @staticmethod
     def release_cuda_memory():
+        """Release memory"""
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.reset_max_memory_allocated()
     @staticmethod
     def remove_input_files(file_paths: List[str]):
+        """Remove gradio cached files"""
         if not file_paths:
             return
         whisper_params: WhisperValues,
         add_timestamp: bool
     ):
+        """cache parameters to the yaml file"""
         cached_params = load_yaml(DEFAULT_PARAMETERS_CONFIG_PATH)
         cached_whisper_param = whisper_params.to_yaml()
         cached_yaml = {**cached_params, **cached_whisper_param}
         cached_yaml["whisper"]["add_timestamp"] = add_timestamp
         save_yaml(cached_yaml, DEFAULT_PARAMETERS_CONFIG_PATH)
+    @staticmethod
+    def resample_audio(audio: Union[str, np.ndarray],
+                       new_sample_rate: int = 16000,
+                       original_sample_rate: Optional[int] = None,) -> np.ndarray:
+        """Resamples audio to 16k sample rate, standard on Whisper model"""
+        if isinstance(audio, str):
+            audio, original_sample_rate = torchaudio.load(audio)
+        else:
+            if original_sample_rate is None:
+                raise ValueError("original_sample_rate must be provided when audio is numpy array.")
+            audio = torch.from_numpy(audio)
+        resampler = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=new_sample_rate)
+        resampled_audio = resampler(audio).numpy()
+        return resampled_audio

modules/whisper/whisper_factory.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import Optional
 import os
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
-                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
@@ -17,6 +17,7 @@ class WhisperFactory:
         faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
         insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
         diarization_model_dir: str = DIARIZATION_MODELS_DIR,
         output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
@@ -37,6 +38,8 @@ class WhisperFactory:
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
         output_dir : str
             Directory path where output files will be saved.
@@ -61,23 +64,27 @@ class WhisperFactory:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
-                diarization_model_dir=diarization_model_dir
             )

 import os
 from modules.utils.paths import (FASTER_WHISPER_MODELS_DIR, DIARIZATION_MODELS_DIR, OUTPUT_DIR,
+                                 INSANELY_FAST_WHISPER_MODELS_DIR, WHISPER_MODELS_DIR, UVR_MODELS_DIR)
 from modules.whisper.faster_whisper_inference import FasterWhisperInference
 from modules.whisper.whisper_Inference import WhisperInference
 from modules.whisper.insanely_fast_whisper_inference import InsanelyFastWhisperInference
         faster_whisper_model_dir: str = FASTER_WHISPER_MODELS_DIR,
         insanely_fast_whisper_model_dir: str = INSANELY_FAST_WHISPER_MODELS_DIR,
         diarization_model_dir: str = DIARIZATION_MODELS_DIR,
+        uvr_model_dir: str = UVR_MODELS_DIR,
         output_dir: str = OUTPUT_DIR,
     ) -> "WhisperBase":
         """
             Directory path for the Insanely Fast Whisper model.
         diarization_model_dir : str
             Directory path for the diarization model.
+        uvr_model_dir : str
+            Directory path for the UVR model.
         output_dir : str
             Directory path where output files will be saved.
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in whisper_typos:
             return WhisperInference(
                 model_dir=whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         elif whisper_type in insanely_fast_whisper_typos:
             return InsanelyFastWhisperInference(
                 model_dir=insanely_fast_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )
         else:
             return FasterWhisperInference(
                 model_dir=faster_whisper_model_dir,
                 output_dir=output_dir,
+                diarization_model_dir=diarization_model_dir,
+                uvr_model_dir=uvr_model_dir
             )

modules/whisper/whisper_parameter.py CHANGED Viewed

@@ -47,6 +47,11 @@ class WhisperParameters:
     hotwords: gr.Textbox
     language_detection_threshold: gr.Number
     language_detection_segments: gr.Number
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
@@ -148,61 +153,76 @@ class WhisperParameters:
     diarization_device: gr.Dropdown
         This parameter is related with whisperx. Device to run diarization model
-    length_penalty:
         This parameter is related to faster-whisper. Exponential length penalty constant.
-    repetition_penalty:
         This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
         (set > 1 to penalize).
-    no_repeat_ngram_size:
         This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
-    prefix:
         This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
-    suppress_blank:
         This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
-    suppress_tokens:
         This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
         of symbols as defined in the model config.json file.
-    max_initial_timestamp:
         This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
-    word_timestamps:
         This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
         and dynamic time warping, and include the timestamps for each word in each segment.
-    prepend_punctuations:
         This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
         with the next word.
-    append_punctuations:
         This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
         with the previous word.
-    max_new_tokens:
         This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
         the maximum will be set by the default max_length.
-    chunk_length:
         This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
         default chunk_length of the FeatureExtractor.
-    hallucination_silence_threshold:
         This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
         (in seconds) when a possible hallucination is detected.
-    hotwords:
         This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
-    language_detection_threshold:
         This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
-    language_detection_segments:
         This parameter is related to faster-whisper. Number of segments to consider for the language detection.
     """
     def as_list(self) -> list:
@@ -273,6 +293,11 @@ class WhisperValues:
     hotwords: Optional[str]
     language_detection_threshold: Optional[float]
     language_detection_segments: int
     """
     A data class to use Whisper parameters.
     """
@@ -323,6 +348,12 @@ class WhisperValues:
             "diarization": {
                 "is_diarize": self.is_diarize,
                 "hf_token": self.hf_token
-            }
         }
         return data

     hotwords: gr.Textbox
     language_detection_threshold: gr.Number
     language_detection_segments: gr.Number
+    is_bgm_separate: gr.Checkbox
+    uvr_model_size: gr.Dropdown
+    uvr_device: gr.Dropdown
+    uvr_segment_size: gr.Number
+    uvr_save_file: gr.Checkbox
     """
     A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
     This data class is used to mitigate the key-value problem between Gradio components and function parameters.
     diarization_device: gr.Dropdown
         This parameter is related with whisperx. Device to run diarization model
+    length_penalty: gr.Number
         This parameter is related to faster-whisper. Exponential length penalty constant.
+    repetition_penalty: gr.Number
         This parameter is related to faster-whisper. Penalty applied to the score of previously generated tokens
         (set > 1 to penalize).
+    no_repeat_ngram_size: gr.Number
         This parameter is related to faster-whisper. Prevent repetitions of n-grams with this size (set 0 to disable).
+    prefix: gr.Textbox
         This parameter is related to faster-whisper. Optional text to provide as a prefix for the first window.
+    suppress_blank: gr.Checkbox
         This parameter is related to faster-whisper. Suppress blank outputs at the beginning of the sampling.
+    suppress_tokens: gr.Textbox
         This parameter is related to faster-whisper. List of token IDs to suppress. -1 will suppress a default set
         of symbols as defined in the model config.json file.
+    max_initial_timestamp: gr.Number
         This parameter is related to faster-whisper. The initial timestamp cannot be later than this.
+    word_timestamps: gr.Checkbox
         This parameter is related to faster-whisper. Extract word-level timestamps using the cross-attention pattern
         and dynamic time warping, and include the timestamps for each word in each segment.
+    prepend_punctuations: gr.Textbox
         This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
         with the next word.
+    append_punctuations: gr.Textbox
         This parameter is related to faster-whisper. If word_timestamps is True, merge these punctuation symbols
         with the previous word.
+    max_new_tokens: gr.Number
         This parameter is related to faster-whisper. Maximum number of new tokens to generate per-chunk. If not set,
         the maximum will be set by the default max_length.
+    chunk_length: gr.Number
         This parameter is related to faster-whisper. The length of audio segments. If it is not None, it will overwrite the
         default chunk_length of the FeatureExtractor.
+    hallucination_silence_threshold: gr.Number
         This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold
         (in seconds) when a possible hallucination is detected.
+    hotwords: gr.Textbox
         This parameter is related to faster-whisper. Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
+    language_detection_threshold: gr.Number
         This parameter is related to faster-whisper. If the maximum probability of the language tokens is higher than this value, the language is detected.
+    language_detection_segments: gr.Number
         This parameter is related to faster-whisper. Number of segments to consider for the language detection.
+    is_bgm_separate: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to separate bgm or not.
+    uvr_model_size: gr.Dropdown
+        This parameter is related to UVR. UVR model size.
+    uvr_device: gr.Dropdown
+        This parameter is related to UVR. Device to run UVR model.
+    uvr_segment_size: gr.Number
+        This parameter is related to UVR. Segment size for UVR model.
+    uvr_save_file: gr.Checkbox
+        This parameter is related to UVR. Boolean value that determines whether to save the file or not.
     """
     def as_list(self) -> list:
     hotwords: Optional[str]
     language_detection_threshold: Optional[float]
     language_detection_segments: int
+    is_bgm_separate: bool
+    uvr_model_size: str
+    uvr_device: str
+    uvr_segment_size: int
+    uvr_save_file: bool
     """
     A data class to use Whisper parameters.
     """
             "diarization": {
                 "is_diarize": self.is_diarize,
                 "hf_token": self.hf_token
+            },
+            "bgm_separation": {
+                "is_separate_bgm": self.is_bgm_separate,
+                "model_size": self.uvr_model_size,
+                "segment_size": self.uvr_segment_size,
+                "save_file": self.uvr_save_file,
+            },
         }
         return data

notebook/whisper-webui.ipynb CHANGED Viewed

@@ -58,7 +58,8 @@
         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
-        "!pip install pyannote.audio==3.3.1"
       ]
     },
     {
@@ -96,7 +97,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"

         "# Temporal bug fix from https://github.com/jhj0517/Whisper-WebUI/issues/256\n",
         "!pip install git+https://github.com/JuanBindez/pytubefix.git\n",
         "!pip install tokenizers==0.19.1\n",
+        "!pip install pyannote.audio==3.3.1\n",
+        "!pip install git+https://github.com/jhj0517/ultimatevocalremover_api.git"
       ]
     },
     {
     },
     {
       "cell_type": "code",
+      "execution_count": 3,
       "metadata": {
         "id": "PQroYRRZzQiN",
         "cellView": "form"

requirements.txt CHANGED Viewed

@@ -12,4 +12,6 @@ transformers==4.42.3
 gradio==4.43.0
 pytubefix
 ruamel.yaml==0.18.6
-pyannote.audio==3.3.1

 gradio==4.43.0
 pytubefix
 ruamel.yaml==0.18.6
+pyannote.audio==3.3.1
+git+https://github.com/jhj0517/ultimatevocalremover_api.git
+git+https://github.com/jhj0517/pyrubberband.git