jhj0517 committed · Commit ddbe0b6 · 1 Parent(s): 78d8e18

Apply Segment model to the pipeline
- modules/diarize/diarize_pipeline.py +1 -0
- modules/diarize/diarizer.py +16 -8
- modules/utils/subtitle_manager.py +11 -0
- modules/vad/silero_vad.py +6 -5
- modules/whisper/faster_whisper_inference.py +8 -8
- modules/whisper/insanely_fast_whisper_inference.py +14 -6
- modules/whisper/whisper_Inference.py +24 -17
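Throughout the commit, plain segment dicts are replaced with a shared `Segment` model from `modules.whisper.data_classes`. That module is not part of this diff; the `seg.dict()` calls below imply a Pydantic `BaseModel`, so a minimal sketch consistent with the fields used here (`start`, `end`, `text`) might look like the following — the exact definition is an assumption:

```python
# Hypothetical sketch of modules/whisper/data_classes.Segment.
# The real module is not shown in this diff; fields and defaults are
# inferred from the .dict() calls and keyword arguments used below.
from typing import Optional
from pydantic import BaseModel


class Segment(BaseModel):
    start: Optional[float] = None  # segment start time in seconds
    end: Optional[float] = None    # segment end time in seconds
    text: Optional[str] = None     # transcribed text of the segment
```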
modules/diarize/diarize_pipeline.py CHANGED

@@ -44,6 +44,7 @@ class DiarizationPipeline:
 def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
     transcript_segments = transcript_result["segments"]
     for seg in transcript_segments:
+        seg = seg.dict()
         # assign speaker to segment (if any)
         diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'],
                                                                                             seg['start'])
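`assign_word_speakers` mixes each segment with a pandas DataFrame of diarization turns, reading `seg['start']`/`seg['end']` and (in the lines that follow the hunk) assigning the best-overlapping speaker, so the new `seg = seg.dict()` line converts each `Segment` into the dict form the rest of the function expects. A standalone illustration of the overlap computation:

```python
# Standalone illustration of the intersection computed above:
# intersection = min(diar_end, seg_end) - max(diar_start, seg_start).
# A positive value means the diarization turn overlaps the segment.
import numpy as np
import pandas as pd

diarize_df = pd.DataFrame({"start": [0.0, 2.0], "end": [1.5, 4.0],
                           "speaker": ["SPEAKER_00", "SPEAKER_01"]})
seg = {"start": 1.0, "end": 3.0}
diarize_df["intersection"] = (np.minimum(diarize_df["end"], seg["end"])
                              - np.maximum(diarize_df["start"], seg["start"]))
print(diarize_df.loc[diarize_df["intersection"].idxmax(), "speaker"])
# SPEAKER_01 (overlap 1.0s vs 0.5s)
```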
modules/diarize/diarizer.py CHANGED

@@ -1,6 +1,6 @@
 import os
 import torch
-from typing import List, Union, BinaryIO, Optional
+from typing import List, Union, BinaryIO, Optional, Tuple
 import numpy as np
 import time
 import logging
@@ -8,6 +8,7 @@ import logging
 from modules.utils.paths import DIARIZATION_MODELS_DIR
 from modules.diarize.diarize_pipeline import DiarizationPipeline, assign_word_speakers
 from modules.diarize.audio_loader import load_audio
+from modules.whisper.data_classes import *


 class Diarizer:
@@ -23,10 +24,10 @@ class Diarizer:

     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
-            transcribed_result: List[dict],
+            transcribed_result: List[Segment],
             use_auth_token: str,
             device: Optional[str] = None
-            ):
+            ) -> Tuple[List[Segment], float]:
         """
         Diarize transcribed result as a post-processing

@@ -34,7 +35,7 @@ class Diarizer:
         ----------
         audio: Union[str, BinaryIO, np.ndarray]
             Audio input. This can be file path or binary type.
-        transcribed_result: List[dict]
+        transcribed_result: List[Segment]
             transcribed result through whisper.
         use_auth_token: str
             Huggingface token with READ permission. This is only needed the first time you download the model.
@@ -44,8 +45,8 @@ class Diarizer:

         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for running
         """
@@ -68,14 +69,21 @@ class Diarizer:
             {"segments": transcribed_result}
         )

+        segments_result = []
         for segment in diarized_result["segments"]:
+            segment = segment.dict()
             speaker = "None"
             if "speaker" in segment:
                 speaker = segment["speaker"]
-            segment["text"] = speaker + "|" + segment["text"].strip()
+            diarized_text = speaker + "|" + segment["text"].strip()
+            segments_result.append(Segment(
+                start=segment["start"],
+                end=segment["end"],
+                text=diarized_text
+            ))

         elapsed_time = time.time() - start_time
-        return diarized_result["segments"], elapsed_time
+        return segments_result, elapsed_time

     def update_pipe(self,
                     use_auth_token: str,
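`run()` now returns proper `Segment` objects, but the speaker label is still carried inside the text using a `speaker|text` convention (with the literal string `"None"` when no speaker was assigned). A consumer can split it back out:

```python
# Splitting the "speaker|text" convention produced by Diarizer.run() above.
# maxsplit=1 keeps any '|' characters that appear in the transcript itself.
diarized_text = "SPEAKER_00|Hello there."
speaker, text = diarized_text.split("|", maxsplit=1)
print(speaker)  # SPEAKER_00
print(text)     # Hello there.
```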
modules/utils/subtitle_manager.py CHANGED

@@ -1,5 +1,7 @@
 import re

+from modules.whisper.data_classes import Segment
+

 def timeformat_srt(time):
     hours = time // 3600
@@ -23,6 +25,9 @@ def write_file(subtitle, output_file):


 def get_srt(segments):
+    if segments and isinstance(segments[0], Segment):
+        segments = [seg.dict() for seg in segments]
+
     output = ""
     for i, segment in enumerate(segments):
         output += f"{i + 1}\n"
@@ -34,6 +39,9 @@ def get_srt(segments):


 def get_vtt(segments):
+    if segments and isinstance(segments[0], Segment):
+        segments = [seg.dict() for seg in segments]
+
     output = "WEBVTT\n\n"
     for i, segment in enumerate(segments):
         output += f"{timeformat_vtt(segment['start'])} --> {timeformat_vtt(segment['end'])}\n"
@@ -44,6 +52,9 @@ def get_vtt(segments):


 def get_txt(segments):
+    if segments and isinstance(segments[0], Segment):
+        segments = [seg.dict() for seg in segments]
+
     output = ""
     for i, segment in enumerate(segments):
         if segment['text'].startswith(' '):
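With these guards, the subtitle helpers accept either the old dict form or the new `Segment` objects. A usage sketch, assuming the project modules are importable and the `Segment` sketch from the top of this commit:

```python
# Both calls below produce identical SRT text after this change.
from modules.utils.subtitle_manager import get_srt
from modules.whisper.data_classes import Segment

segments = [Segment(start=0.0, end=1.2, text="Hello"),
            Segment(start=1.2, end=2.5, text="world")]
print(get_srt(segments))                          # Segment objects
print(get_srt([seg.dict() for seg in segments]))  # equivalent dict form
```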
modules/vad/silero_vad.py CHANGED

@@ -5,7 +5,8 @@ import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
 import faster_whisper
-from faster_whisper.transcribe import SpeechTimestampsMap
+from modules.whisper.data_classes import *
+from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr


@@ -247,18 +248,18 @@ class SileroVAD:

     def restore_speech_timestamps(
         self,
-        segments: List[dict],
+        segments: List[Segment],
         speech_chunks: List[dict],
         sampling_rate: Optional[int] = None,
-    ) -> List[dict]:
+    ) -> List[Segment]:
         if sampling_rate is None:
             sampling_rate = self.sampling_rate

         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)

         for segment in segments:
-            segment["start"] = ts_map.get_original_time(segment["start"])
-            segment["end"] = ts_map.get_original_time(segment["end"])
+            segment.start = ts_map.get_original_time(segment.start)
+            segment.end = ts_map.get_original_time(segment.end)

         return segments
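Note that the loop now mutates `segment.start`/`segment.end` in place, which works because the project's `Segment` model is mutable, unlike faster-whisper's immutable NamedTuple segments. The mapping itself comes from faster-whisper's `SpeechTimestampsMap`, which shifts VAD-trimmed timestamps back onto the original audio timeline; a standalone sketch (chunk boundaries are in samples, the values are invented for illustration):

```python
# After VAD removes silence, timestamps are relative to the concatenated
# speech. SpeechTimestampsMap adds back the silence that was cut out.
from faster_whisper.transcribe import SpeechTimestampsMap

sampling_rate = 16000
# One speech chunk that originally ran from 5.0s to 8.0s in the source audio.
speech_chunks = [{"start": 5 * sampling_rate, "end": 8 * sampling_rate}]
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
print(ts_map.get_original_time(1.0))  # ~6.0: 1s into the clip -> 6s originally
```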
modules/whisper/faster_whisper_inference.py CHANGED

@@ -40,7 +40,7 @@ class FasterWhisperInference(BaseTranscriptionPipeline):
                    audio: Union[str, BinaryIO, np.ndarray],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.

@@ -55,8 +55,8 @@ class FasterWhisperInference(BaseTranscriptionPipeline):

         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
@@ -102,11 +102,11 @@ class FasterWhisperInference(BaseTranscriptionPipeline):
         segments_result = []
         for segment in segments:
             progress(segment.start / info.duration, desc="Transcribing..")
-            segments_result.append({
-                "start": segment.start,
-                "end": segment.end,
-                "text": segment.text
-            })
+            segments_result.append(Segment(
+                start=segment.start,
+                end=segment.end,
+                text=segment.text
+            ))

         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
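For context: faster-whisper's `transcribe()` returns a lazy generator, so the loop above is what actually drives decoding while copying each native segment into the project's `Segment` model. A standalone usage sketch (model size and audio path are placeholders, not values from this repository):

```python
# Minimal faster-whisper loop mirroring the one above.
from faster_whisper import WhisperModel

model = WhisperModel("tiny", device="cpu", compute_type="int8")
segments, info = model.transcribe("audio.wav", beam_size=5)
for seg in segments:  # consuming the generator performs the transcription
    print(f"[{seg.start:.2f} -> {seg.end:.2f}]{seg.text}")
```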
modules/whisper/insanely_fast_whisper_inference.py CHANGED

@@ -40,7 +40,7 @@ class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
                    audio: Union[str, np.ndarray, torch.Tensor],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.

@@ -55,8 +55,8 @@ class InsanelyFastWhisperInference(BaseTranscriptionPipeline):

         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
@@ -95,9 +95,17 @@ class InsanelyFastWhisperInference(BaseTranscriptionPipeline):
             generate_kwargs=kwargs
         )

-        segments_result = self.format_result(
-            transcribed_result=segments,
-        )
+        segments_result = []
+        for item in segments["chunks"]:
+            start, end = item["timestamp"][0], item["timestamp"][1]
+            if end is None:
+                end = start
+            segments_result.append(Segment(
+                text=item["text"],
+                start=start,
+                end=end
+            ))
+
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
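The `end is None` guard matters because the transformers ASR pipeline can emit a final chunk whose closing timestamp was never decoded; clamping `end` to `start` keeps every `Segment` well-formed. In isolation:

```python
# Reproducing the guard above on a chunk with an open-ended timestamp,
# as the transformers pipeline can return for the last chunk.
item = {"text": " goodbye", "timestamp": (27.3, None)}
start, end = item["timestamp"][0], item["timestamp"][1]
if end is None:
    end = start
print(start, end)  # 27.3 27.3
```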
modules/whisper/whisper_Inference.py CHANGED

@@ -30,7 +30,7 @@ class WhisperInference(BaseTranscriptionPipeline):
                    audio: Union[str, np.ndarray, torch.Tensor],
                    progress: gr.Progress = gr.Progress(),
                    *whisper_params,
-                   ) -> Tuple[List[dict], float]:
+                   ) -> Tuple[List[Segment], float]:
         """
         transcribe method for faster-whisper.

@@ -45,8 +45,8 @@ class WhisperInference(BaseTranscriptionPipeline):

         Returns
         ----------
-        segments_result: List[dict]
-            list of dicts that includes start, end timestamps and transcribed text
+        segments_result: List[Segment]
+            list of Segment that includes start, end timestamps and transcribed text
         elapsed_time: float
             elapsed time for transcription
         """
@@ -59,21 +59,28 @@ class WhisperInference(BaseTranscriptionPipeline):
         def progress_callback(progress_value):
             progress(progress_value, desc="Transcribing..")

-        segments_result = self.model.transcribe(audio=audio,
-                                                language=params.lang,
-                                                verbose=False,
-                                                beam_size=params.beam_size,
-                                                logprob_threshold=params.log_prob_threshold,
-                                                no_speech_threshold=params.no_speech_threshold,
-                                                task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
-                                                fp16=True if params.compute_type == "float16" else False,
-                                                best_of=params.best_of,
-                                                patience=params.patience,
-                                                temperature=params.temperature,
-                                                compression_ratio_threshold=params.compression_ratio_threshold,
-                                                progress_callback=progress_callback)["segments"]
-        elapsed_time = time.time() - start_time
+        result = self.model.transcribe(audio=audio,
+                                       language=params.lang,
+                                       verbose=False,
+                                       beam_size=params.beam_size,
+                                       logprob_threshold=params.log_prob_threshold,
+                                       no_speech_threshold=params.no_speech_threshold,
+                                       task="translate" if params.is_translate and self.current_model_size in self.translatable_models else "transcribe",
+                                       fp16=True if params.compute_type == "float16" else False,
+                                       best_of=params.best_of,
+                                       patience=params.patience,
+                                       temperature=params.temperature,
+                                       compression_ratio_threshold=params.compression_ratio_threshold,
+                                       progress_callback=progress_callback,)["segments"]
+        segments_result = []
+        for segment in result:
+            segments_result.append(Segment(
+                start=segment["start"],
+                end=segment["end"],
+                text=segment["text"]
+            ))

+        elapsed_time = time.time() - start_time
         return segments_result, elapsed_time

     def update_model(self,
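The new loop relies on openai-whisper's result schema: `model.transcribe(...)` returns a dict whose `"segments"` entries are dicts carrying at least `start`, `end`, and `text`, which are wrapped into `Segment` objects one by one. The shape, without loading a model:

```python
# Shape of the openai-whisper result consumed by the loop above
# (values are illustrative, not taken from a real run).
result = {"segments": [{"start": 0.0, "end": 2.4, "text": " Hello."},
                       {"start": 2.4, "end": 3.1, "text": " Bye."}]}
for segment in result["segments"]:
    print(segment["start"], segment["end"], segment["text"].strip())
```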