Merge pull request #286 from jhj0517/refactor/remove-duplicates
app.py CHANGED

@@ -88,6 +88,9 @@ class App:
 nb_compression_ratio_threshold = gr.Number(label="Compression Ratio Threshold", value=whisper_params["compression_ratio_threshold"],
 interactive=True,
 info="If the gzip compression ratio is above this value, treat as failed.")
+nb_chunk_length = gr.Number(label="Chunk Length (s)", value=lambda: whisper_params["chunk_length"],
+precision=0,
+info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
 with gr.Group(visible=isinstance(self.whisper_inf, FasterWhisperInference)):
 nb_length_penalty = gr.Number(label="Length Penalty", value=whisper_params["length_penalty"],
 info="Exponential length penalty constant.")

@@ -113,9 +116,6 @@ class App:
 nb_max_new_tokens = gr.Number(label="Max New Tokens", value=lambda: whisper_params["max_new_tokens"],
 precision=0,
 info="Maximum number of new tokens to generate per-chunk. If not set, the maximum will be set by the default max_length.")
-nb_chunk_length = gr.Number(label="Chunk Length", value=lambda: whisper_params["chunk_length"],
-precision=0,
-info="The length of audio segments. If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.")
 nb_hallucination_silence_threshold = gr.Number(label="Hallucination Silence Threshold (sec)",
 value=lambda: whisper_params["hallucination_silence_threshold"],
 info="When 'Word Timestamps' is True, skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected.")

@@ -127,8 +127,6 @@ class App:
 precision=0,
 info="Number of segments to consider for the language detection.")
 with gr.Group(visible=isinstance(self.whisper_inf, InsanelyFastWhisperInference)):
-nb_chunk_length_s = gr.Number(label="Chunk Lengths (sec)", value=whisper_params["chunk_length_s"],
-precision=0)
 nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)
 
 with gr.Accordion("BGM Separation", open=False):

@@ -177,13 +175,13 @@ class App:
 temperature=sd_temperature, compression_ratio_threshold=nb_compression_ratio_threshold,
 vad_filter=cb_vad_filter, threshold=sd_threshold, min_speech_duration_ms=nb_min_speech_duration_ms,
 max_speech_duration_s=nb_max_speech_duration_s, min_silence_duration_ms=nb_min_silence_duration_ms,
-speech_pad_ms=nb_speech_pad_ms,
+speech_pad_ms=nb_speech_pad_ms, chunk_length=nb_chunk_length, batch_size=nb_batch_size,
 is_diarize=cb_diarize, hf_token=tb_hf_token, diarization_device=dd_diarization_device,
 length_penalty=nb_length_penalty, repetition_penalty=nb_repetition_penalty,
 no_repeat_ngram_size=nb_no_repeat_ngram_size, prefix=tb_prefix, suppress_blank=cb_suppress_blank,
 suppress_tokens=tb_suppress_tokens, max_initial_timestamp=nb_max_initial_timestamp,
 word_timestamps=cb_word_timestamps, prepend_punctuations=tb_prepend_punctuations,
-append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
+append_punctuations=tb_append_punctuations, max_new_tokens=nb_max_new_tokens,
 hallucination_silence_threshold=nb_hallucination_silence_threshold, hotwords=tb_hotwords,
 language_detection_threshold=nb_language_detection_threshold,
 language_detection_segments=nb_language_detection_segments,
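For reference, the consolidated control is an ordinary `gr.Number` that now lives in the backend-agnostic parameter block, so faster-whisper and insanely-fast-whisper read one shared chunk length. The sketch below is a minimal standalone approximation, not the app's actual layout; the `whisper_params` dict and the bare `Blocks` context are placeholders:

```python
import gradio as gr

# Placeholder defaults; in the app these come from configs/default_parameters.yaml.
whisper_params = {"chunk_length": 30, "batch_size": 24}

with gr.Blocks() as demo:
    # One shared field replaces the old per-backend "Chunk Length" / "Chunk Lengths (sec)" inputs.
    nb_chunk_length = gr.Number(
        label="Chunk Length (s)",
        value=lambda: whisper_params["chunk_length"],  # callable, so the default is re-read per session
        precision=0,
        info="The length of audio segments. If it is not None, it will overwrite "
             "the default chunk_length of the FeatureExtractor.",
    )
    nb_batch_size = gr.Number(label="Batch Size", value=whisper_params["batch_size"], precision=0)

if __name__ == "__main__":
    demo.launch()
```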
configs/default_parameters.yaml CHANGED

@@ -12,7 +12,7 @@ whisper:
 initial_prompt: null
 temperature: 0
 compression_ratio_threshold: 2.4
-
+chunk_length: 30
 batch_size: 24
 length_penalty: 1
 repetition_penalty: 1

@@ -25,7 +25,6 @@ whisper:
 prepend_punctuations: "\"'“¿([{-"
 append_punctuations: "\"'.。,,!!??::”)]}、"
 max_new_tokens: null
-chunk_length: null
 hallucination_silence_threshold: null
 hotwords: null
 language_detection_threshold: null
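A quick way to check the new default is to load the file and read the `whisper` section named in the hunk headers. This is a small sketch assuming PyYAML and the repository-relative path shown above:

```python
import yaml

# Load the shipped defaults; the top-level "whisper" key matches the hunk header above.
with open("configs/default_parameters.yaml", encoding="utf-8") as f:
    defaults = yaml.safe_load(f)

whisper_defaults = defaults["whisper"]
print(whisper_defaults["chunk_length"])  # 30 after this change (single chunk-length setting)
print(whisper_defaults["batch_size"])    # 24
```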
modules/whisper/insanely_fast_whisper_inference.py CHANGED

@@ -78,7 +78,7 @@ class InsanelyFastWhisperInference(WhisperBase):
 segments = self.model(
 inputs=audio,
 return_timestamps=True,
-chunk_length_s=params.chunk_length_s,
+chunk_length_s=params.chunk_length,
 batch_size=params.batch_size,
 generate_kwargs={
 "language": params.lang,
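The `self.model(...)` call above matches the interface of a Hugging Face `transformers` automatic-speech-recognition pipeline, where `chunk_length_s` is the chunking window in seconds; only the source of that value changes here (`params.chunk_length` instead of the removed `chunk_length_s` field). A hedged standalone sketch of the same call pattern, with a placeholder checkpoint and values:

```python
from transformers import pipeline

# Example checkpoint for illustration only.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3")

result = asr(
    "audio.wav",                # path, URL, or raw audio samples
    return_timestamps=True,
    chunk_length_s=30,          # fed from params.chunk_length after this PR
    batch_size=24,
    generate_kwargs={"language": "en", "task": "transcribe"},
)
print(result["text"])
```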
modules/whisper/whisper_parameter.py CHANGED

@@ -26,7 +26,6 @@ class WhisperParameters:
 max_speech_duration_s: gr.Number
 min_silence_duration_ms: gr.Number
 speech_pad_ms: gr.Number
-chunk_length_s: gr.Number
 batch_size: gr.Number
 is_diarize: gr.Checkbox
 hf_token: gr.Textbox

@@ -136,10 +135,6 @@ class WhisperParameters:
 speech_pad_ms: gr.Number
 This parameter is related with Silero VAD. Final speech chunks are padded by speech_pad_ms each side
 
-chunk_length_s: gr.Number
-This parameter is related with insanely-fast-whisper pipe.
-Maximum length of each chunk
-
 batch_size: gr.Number
 This parameter is related with insanely-fast-whisper pipe. Batch size to pass to the pipe
 

@@ -193,8 +188,8 @@ class WhisperParameters:
 the maximum will be set by the default max_length.
 
 chunk_length: gr.Number
-This parameter is related to faster-whisper. The length of audio segments
-
+This parameter is related to faster-whisper and insanely-fast-whisper. The length of audio segments in seconds.
+If it is not None, it will overwrite the default chunk_length of the FeatureExtractor.
 
 hallucination_silence_threshold: gr.Number
 This parameter is related to faster-whisper. When word_timestamps is True, skip silent periods longer than this threshold

@@ -252,52 +247,51 @@ class WhisperParameters:
 
 @dataclass
 class WhisperValues:
-model_size: str
-lang: str
-is_translate: bool
-beam_size: int
-log_prob_threshold: float
-no_speech_threshold: float
-compute_type: str
-best_of: int
-patience: float
-condition_on_previous_text: bool
-prompt_reset_on_temperature: float
-initial_prompt: Optional[str]
-temperature: float
-compression_ratio_threshold: float
-vad_filter: bool
-threshold: float
-min_speech_duration_ms: int
-max_speech_duration_s: float
-min_silence_duration_ms: int
-speech_pad_ms: int
-... (remaining field declarations)
-uvr_save_file: bool
+model_size: str = "large-v2"
+lang: Optional[str] = None
+is_translate: bool = False
+beam_size: int = 5
+log_prob_threshold: float = -1.0
+no_speech_threshold: float = 0.6
+compute_type: str = "float16"
+best_of: int = 5
+patience: float = 1.0
+condition_on_previous_text: bool = True
+prompt_reset_on_temperature: float = 0.5
+initial_prompt: Optional[str] = None
+temperature: float = 0.0
+compression_ratio_threshold: float = 2.4
+vad_filter: bool = False
+threshold: float = 0.5
+min_speech_duration_ms: int = 250
+max_speech_duration_s: float = float("inf")
+min_silence_duration_ms: int = 2000
+speech_pad_ms: int = 400
+batch_size: int = 24
+is_diarize: bool = False
+hf_token: str = ""
+diarization_device: str = "cuda"
+length_penalty: float = 1.0
+repetition_penalty: float = 1.0
+no_repeat_ngram_size: int = 0.0
+prefix: Optional[str] = None
+suppress_blank: bool = True
+suppress_tokens: Optional[str] = "[-1]"
+max_initial_timestamp: float = 0.0
+word_timestamps: bool = False
+prepend_punctuations: Optional[str] = "\"'“¿([{-"
+append_punctuations: Optional[str] = "\"'.。,,!!??::”)]}、"
+max_new_tokens: Optional[int] = None
+chunk_length: Optional[int] = 30
+hallucination_silence_threshold: Optional[float] = None
+hotwords: Optional[str] = None
+language_detection_threshold: Optional[float] = None
+language_detection_segments: int = 1
+is_bgm_separate: bool = False
+uvr_model_size: str = "UVR-MDX-NET-Inst_HQ_4"
+uvr_device: str = "cuda"
+uvr_segment_size: int = 256
+uvr_save_file: bool = False
 """
 A data class to use Whisper parameters.
 """

@@ -318,7 +312,6 @@ class WhisperValues:
 "initial_prompt": None if not self.initial_prompt else self.initial_prompt,
 "temperature": self.temperature,
 "compression_ratio_threshold": self.compression_ratio_threshold,
-"chunk_length_s": None if self.chunk_length_s is None else self.chunk_length_s,
 "batch_size": self.batch_size,
 "length_penalty": self.length_penalty,
 "repetition_penalty": self.repetition_penalty,
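With defaults on every field, `WhisperValues` can now be constructed with no arguments and selectively overridden. A trimmed sketch of the same dataclass pattern; the class name here is hypothetical, and the field names and defaults mirror the diff above:

```python
from dataclasses import asdict, dataclass
from typing import Optional


@dataclass
class WhisperValuesSketch:
    # Trimmed stand-in for WhisperValues, limited to a few of its fields.
    model_size: str = "large-v2"
    lang: Optional[str] = None
    chunk_length: Optional[int] = 30
    batch_size: int = 24


# Defaults allow zero-argument construction and partial overrides.
values = WhisperValuesSketch(chunk_length=20)
print(asdict(values))  # {'model_size': 'large-v2', 'lang': None, 'chunk_length': 20, 'batch_size': 24}
```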