split whisper_online.py into smaller files
Browse files- README.md +2 -2
- src/demo.png +0 -0
- src/{live_transcription.html → web/live_transcription.html} +0 -0
- src/whisper_streaming/backends.py +368 -0
- src/whisper_streaming/online_asr.py +401 -0
- whisper_fastapi_online_server.py +1 -1
- whisper_online.py +6 -770
README.md
CHANGED
|
@@ -12,12 +12,12 @@ This project extends the [Whisper Streaming](https://github.com/ufal/whisper_str
|
|
| 12 |
|
| 13 |
5. **MLX Whisper backend**: Integrates the alternative backend option MLX Whisper, optimized for efficient speech recognition on Apple silicon.
|
| 14 |
|
| 15 |
-

|
| 16 |
|
| 17 |
## Code Origins
|
| 18 |
|
| 19 |
This project reuses and extends code from the original Whisper Streaming repository:
|
| 20 |
-
- whisper_online.py: Contains code from whisper_streaming
|
| 21 |
- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
|
| 22 |
|
| 23 |
## Installation
|
|
|
|
| 12 |
|
| 13 |
5. **MLX Whisper backend**: Integrates the alternative backend option MLX Whisper, optimized for efficient speech recognition on Apple silicon.
|
| 14 |
|
| 15 |
+

|
| 16 |
|
| 17 |
## Code Origins
|
| 18 |
|
| 19 |
This project reuses and extends code from the original Whisper Streaming repository:
|
| 20 |
+
- whisper_online.py, backends.py and online_asr.py: Contain code from whisper_streaming
|
| 21 |
- silero_vad_iterator.py: Originally from the Silero VAD repository, included in the whisper_streaming project.
|
| 22 |
|
| 23 |
## Installation
|
src/demo.png
DELETED
|
Binary file (82.6 kB)
|
|
|
src/{live_transcription.html → web/live_transcription.html}
RENAMED
|
File without changes
|
src/whisper_streaming/backends.py
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import logging
|
| 3 |
+
|
| 4 |
+
import io
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
import math
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class ASRBase:
    """Abstract base class for the ASR backends defined in this module.

    The constructor stores the log target and the language, then calls
    ``load_model`` and keeps its return value in ``self.model``. Subclasses
    must override ``load_model``, ``transcribe`` and ``use_vad``.
    """

    # Join transcribed words with this character (" " for whisper_timestamped,
    # "" for faster-whisper because it emits the spaces when needed).
    sep = " "

    def __init__(
        self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr
    ):
        """Store the configuration and load the model.

        Args:
            lan: language code; the special value "auto" is stored as ``None``.
            modelsize: forwarded to ``load_model``.
            cache_dir: forwarded to ``load_model``.
            model_dir: forwarded to ``load_model``.
            logfile: stream kept on the instance as ``self.logfile``.
        """
        self.logfile = logfile

        # Extra keyword arguments that backends add to their transcribe call.
        self.transcribe_kargs = {}
        self.original_language = None if lan == "auto" else lan

        self.model = self.load_model(modelsize, cache_dir, model_dir)

    def load_model(self, modelsize, cache_dir, model_dir=None):
        """Load and return the backend model.

        ``model_dir`` is accepted because ``__init__`` always passes it.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError("must be implemented in the child class")

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio``, optionally conditioned on ``init_prompt``.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError("must be implemented in the child class")

    def use_vad(self):
        """Enable voice activity detection in the backend.

        Raises:
            NotImplementedError: always; subclasses must override this method.
        """
        raise NotImplementedError("must be implemented in the child class")
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class WhisperTimestampedASR(ASRBase):
    """Backend built on the whisper_timestamped library.

    Initially, the code was tested on this backend. It worked, but slower
    than faster-whisper. On the other hand, the installation for GPU could
    be easier.
    """

    sep = " "

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Load a whisper model; ``model_dir`` is not implemented and ignored."""
        import whisper
        import whisper_timestamped
        from whisper_timestamped import transcribe_timestamped

        # Keep the transcription function on the instance for transcribe().
        self.transcribe_timestamped = transcribe_timestamped
        if model_dir is not None:
            logger.debug("ignoring model_dir, not implemented")
        return whisper.load_model(modelsize, download_root=cache_dir)

    def transcribe(self, audio, init_prompt=""):
        """Run whisper_timestamped on ``audio`` and return its result object."""
        return self.transcribe_timestamped(
            self.model,
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            verbose=None,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )

    def ts_words(self, r):
        """Convert a transcribe result object to [(beg, end, "word1"), ...]."""
        return [
            (word["start"], word["end"], word["text"])
            for segment in r["segments"]
            for word in segment["words"]
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in ``res``."""
        ends = []
        for segment in res["segments"]:
            ends.append(segment["end"])
        return ends

    def use_vad(self):
        """Request VAD by adding ``vad=True`` to the transcribe arguments."""
        self.transcribe_kargs["vad"] = True

    def set_translate_task(self):
        """Switch the task to translation in the transcribe arguments."""
        self.transcribe_kargs["task"] = "translate"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class FasterWhisperASR(ASRBase):
    """Backend built on the faster-whisper library.

    Works much faster, appx 4-times (in offline mode). For GPU, it requires
    installation with a specific CUDNN version.
    """

    sep = ""

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Create the WhisperModel from ``model_dir`` or, failing that, ``modelsize``.

        Raises:
            ValueError: if neither ``modelsize`` nor ``model_dir`` is set.
        """
        from faster_whisper import WhisperModel

        # logging.getLogger("faster_whisper").setLevel(logger.level)
        if model_dir is None and modelsize is None:
            raise ValueError("modelsize or model_dir parameter must be set")

        if model_dir is not None:
            logger.debug(
                f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used."
            )
            model_source = model_dir
        else:
            model_source = modelsize

        # FP16 on CUDA worked fast and reliably on NVIDIA L40.
        # Alternatives that were tried:
        #   - GPU with INT8 (compute_type="int8_float16"): the transcripts were
        #     different, probably worse than with FP16, and it was slightly
        #     (appx 20%) slower.
        #   - CPU with INT8 (device="cpu", compute_type="int8"): works, but
        #     slow, appx 10-times than cuda FP16.
        return WhisperModel(
            model_source,
            device="cuda",
            compute_type="float16",
            download_root=cache_dir,
        )

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio`` and return the segments as a list."""
        # tested: beam_size=5 is faster and better than 1 (on one 200 second
        # document from En ESIC, min chunk 0.01)
        segment_iter, _info = self.model.transcribe(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            beam_size=5,
            word_timestamps=True,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )
        # _info contains the language detection result; it is not used here.
        return list(segment_iter)

    def ts_words(self, segments):
        """Return [(beg, end, "word"), ...], skipping segments with no_speech_prob > 0.9."""
        # The word text is not stripped: the spaces must not be merged away.
        return [
            (word.start, word.end, word.word)
            for segment in segments
            for word in segment.words
            if not segment.no_speech_prob > 0.9
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in ``res``."""
        ends = []
        for segment in res:
            ends.append(segment.end)
        return ends

    def use_vad(self):
        """Request VAD by adding ``vad_filter=True`` to the transcribe arguments."""
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        """Switch the task to translation in the transcribe arguments."""
        self.transcribe_kargs["task"] = "translate"
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
class MLXWhisper(ASRBase):
    """
    Uses MLX Whisper library as the backend, optimized for Apple Silicon.
    Models available: https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc
    Significantly faster than faster-whisper (without CUDA) on Apple M1.
    """

    sep = " "

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """
        Loads the MLX-compatible Whisper model.

        Args:
            modelsize (str, optional): The size or name of the Whisper model to load.
                If provided, it will be translated to an MLX-compatible model path using the `translate_model_name` method.
                Example: "large-v3-turbo" -> "mlx-community/whisper-large-v3-turbo".
            cache_dir (str, optional): Path to the directory for caching models.
                **Note**: This is not supported by MLX Whisper and will be ignored.
            model_dir (str, optional): Direct path to a custom model directory.
                If specified, it overrides the `modelsize` parameter.

        Returns:
            The `mlx_whisper.transcribe.transcribe` function, which is stored
            as `self.model` by the base-class constructor.

        Raises:
            ValueError: if neither `modelsize` nor `model_dir` is set, or if
                `modelsize` is not a recognized model name.
        """
        from mlx_whisper.transcribe import ModelHolder, transcribe
        import mlx.core as mx

        if model_dir is not None:
            logger.debug(
                f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used."
            )
            model_size_or_path = model_dir
        elif modelsize is not None:
            model_size_or_path = self.translate_model_name(modelsize)
            logger.debug(
                f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used."
            )
        else:
            # Without this branch the name below would be unbound and the
            # method would fail with an UnboundLocalError.
            raise ValueError("modelsize or model_dir parameter must be set")

        self.model_size_or_path = model_size_or_path

        # In mlx_whisper.transcribe, dtype is defined as:
        # dtype = mx.float16 if decode_options.get("fp16", True) else mx.float32
        # Since we do not use decode_options in self.transcribe, we will set dtype to mx.float16
        dtype = mx.float16
        ModelHolder.get_model(model_size_or_path, dtype)
        return transcribe

    def translate_model_name(self, model_name):
        """
        Translates a given model name to its corresponding MLX-compatible model path.

        Args:
            model_name (str): The name of the model to translate.

        Returns:
            str: The MLX-compatible model path.

        Raises:
            ValueError: if `model_name` is not in the mapping.
        """
        # Dictionary mapping model names to MLX-compatible paths
        model_mapping = {
            "tiny.en": "mlx-community/whisper-tiny.en-mlx",
            "tiny": "mlx-community/whisper-tiny-mlx",
            "base.en": "mlx-community/whisper-base.en-mlx",
            "base": "mlx-community/whisper-base-mlx",
            "small.en": "mlx-community/whisper-small.en-mlx",
            "small": "mlx-community/whisper-small-mlx",
            "medium.en": "mlx-community/whisper-medium.en-mlx",
            "medium": "mlx-community/whisper-medium-mlx",
            "large-v1": "mlx-community/whisper-large-v1-mlx",
            "large-v2": "mlx-community/whisper-large-v2-mlx",
            "large-v3": "mlx-community/whisper-large-v3-mlx",
            "large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
            "large": "mlx-community/whisper-large-mlx",
        }

        # Retrieve the corresponding MLX model path
        mlx_model_path = model_mapping.get(model_name)
        if not mlx_model_path:
            raise ValueError(
                f"Model name '{model_name}' is not recognized or not supported."
            )
        return mlx_model_path

    def transcribe(self, audio, init_prompt=""):
        """Transcribe `audio` and return the list of segments (possibly empty).

        `self.transcribe_kargs` is not forwarded; a warning is logged when it
        is non-empty.
        """
        if self.transcribe_kargs:
            logger.warning("Transcribe kwargs (vad, task) are not compatible with MLX Whisper and will be ignored.")
        segments = self.model(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            word_timestamps=True,
            condition_on_previous_text=True,
            path_or_hf_repo=self.model_size_or_path,
        )
        return segments.get("segments", [])

    def ts_words(self, segments):
        """
        Extract timestamped words from transcription segments and skips words with high no-speech probability.
        """
        return [
            (word["start"], word["end"], word["word"])
            for segment in segments
            for word in segment.get("words", [])
            if segment.get("no_speech_prob", 0) <= 0.9
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in `res`."""
        return [s["end"] for s in res]

    def use_vad(self):
        """Record the VAD request; `transcribe` currently ignores it with a warning."""
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        """Record the translate task; `transcribe` currently ignores it with a warning."""
        self.transcribe_kargs["task"] = "translate"
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
class OpenaiApiASR(ASRBase):
    """Uses OpenAI's Whisper API for audio transcription."""

    def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
        """Configure the API backend and create the client.

        Args:
            lan: ISO-639-1 language code; "auto" is stored as ``None``.
            temperature: value sent as the ``temperature`` request parameter.
            logfile: stream kept on the instance as ``self.logfile``.
        """
        self.logfile = logfile

        self.modelname = "whisper-1"
        self.original_language = (
            None if lan == "auto" else lan
        )  # ISO-639-1 language code
        self.response_format = "verbose_json"
        self.temperature = temperature

        self.load_model()

        self.use_vad_opt = False

        # reset the task in set_translate_task
        self.task = "transcribe"

    def load_model(self, *args, **kwargs):
        """Create the OpenAI client; all arguments are accepted and ignored."""
        from openai import OpenAI

        self.client = OpenAI()

        self.transcribed_seconds = (
            0  # for logging how many seconds were processed by API, to know the cost
        )

    def ts_words(self, segments):
        """Return [(beg, end, "word"), ...] from an API response.

        When VAD is enabled, words whose start falls inside a segment with
        ``no_speech_prob`` above 0.8 are skipped.
        """
        no_speech_segments = []
        if self.use_vad_opt:
            # NOTE(review): segments are accessed like dicts here while words
            # are accessed as objects below -- confirm against the installed
            # openai SDK version.
            for segment in segments.segments:
                # TODO: threshold can be set from outside
                if segment["no_speech_prob"] > 0.8:
                    no_speech_segments.append(
                        (segment.get("start"), segment.get("end"))
                    )

        o = []
        for word in segments.words:
            start = word.start
            end = word.end
            if any(s[0] <= start <= s[1] for s in no_speech_segments):
                # The word lies in a no-speech segment: skip it.
                continue
            o.append((start, end, word.word))
        return o

    def segments_end_ts(self, res):
        """Return the end timestamp of every word in ``res.words``."""
        return [s.end for s in res.words]

    def transcribe(self, audio_data, prompt=None, *args, init_prompt=None, **kwargs):
        """Send ``audio_data`` to the API and return the response object.

        Args:
            audio_data: audio samples, written as 16 kHz PCM_16 WAV.
            prompt: optional text sent as the ``prompt`` request parameter.
            init_prompt: alias of ``prompt`` matching the keyword used by the
                other backends; used only when ``prompt`` is None.
            *args, **kwargs: accepted and ignored.
        """
        # The other backends take the prompt as ``init_prompt``. Without this
        # alias that keyword landed in **kwargs and the prompt was never sent.
        if prompt is None:
            prompt = init_prompt

        # Write the audio data to a buffer
        buffer = io.BytesIO()
        buffer.name = "temp.wav"
        sf.write(buffer, audio_data, samplerate=16000, format="WAV", subtype="PCM_16")
        buffer.seek(0)  # Reset buffer's position to the beginning

        self.transcribed_seconds += math.ceil(
            len(audio_data) / 16000
        )  # it rounds up to the whole seconds

        params = {
            "model": self.modelname,
            "file": buffer,
            "response_format": self.response_format,
            "temperature": self.temperature,
            "timestamp_granularities": ["word", "segment"],
        }
        if self.task != "translate" and self.original_language:
            params["language"] = self.original_language
        if prompt:
            params["prompt"] = prompt

        if self.task == "translate":
            proc = self.client.audio.translations
        else:
            proc = self.client.audio.transcriptions

        # Process transcription/translation
        transcript = proc.create(**params)
        logger.debug(
            f"OpenAI API processed accumulated {self.transcribed_seconds} seconds"
        )

        return transcript

    def use_vad(self):
        """Enable filtering of words inside no-speech segments in ``ts_words``."""
        self.use_vad_opt = True

    def set_translate_task(self):
        """Use the translations endpoint instead of transcriptions."""
        self.task = "translate"
|
src/whisper_streaming/online_asr.py
ADDED
|
@@ -0,0 +1,401 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
import logging
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
class HypothesisBuffer:
    """Keeps successive word hypotheses and commits the words they agree on.

    Words are (beg, end, text) tuples. ``insert`` stores a new hypothesis,
    ``flush`` commits the common prefix of the last two hypotheses.
    """

    def __init__(self, logfile=sys.stderr):
        self.logfile = logfile

        self.commited_in_buffer = []
        self.buffer = []
        self.new = []

        self.last_commited_time = 0
        self.last_commited_word = None

    def insert(self, new, offset):
        """Store the hypothesis ``new``, shifted by ``offset`` seconds.

        Only the words that extend the committed part are kept in
        ``self.new``: they start roughly after ``last_commited_time``, and a
        leading n-gram (n up to 5) that repeats the last committed words is
        dropped.
        """
        shifted = [(beg + offset, end + offset, text) for beg, end, text in new]
        earliest = self.last_commited_time - 0.1
        self.new = [word for word in shifted if word[0] > earliest]

        if not self.new:
            return
        first_start = self.new[0][0]
        if not abs(first_start - self.last_commited_time) < 1:
            return
        if not self.commited_in_buffer:
            return

        # Search for 1, 2, ..., 5 consecutive words (n-grams) that are
        # identical at the end of the committed words and at the start of the
        # new ones. If there are any, they are dropped from the new words.
        longest = min(len(self.commited_in_buffer), len(self.new), 5)
        for size in range(1, longest + 1):
            committed_tail = " ".join(
                word[2] for word in self.commited_in_buffer[-size:]
            )
            new_head = " ".join(word[2] for word in self.new[:size])
            if committed_tail == new_head:
                dropped = self.new[:size]
                del self.new[:size]
                words_msg = " ".join(repr(word) for word in dropped)
                logger.debug(f"removing last {size} words: {words_msg}")
                break

    def flush(self):
        """Return the committed chunk: the longest common prefix of the 2 last inserts."""
        commit = []
        while self.new and self.buffer and self.new[0][2] == self.buffer[0][2]:
            beg, end, text = self.new.pop(0)
            self.buffer.pop(0)
            commit.append((beg, end, text))
            self.last_commited_word = text
            self.last_commited_time = end
        # The unconfirmed rest of the newest hypothesis becomes the buffer.
        self.buffer = self.new
        self.new = []
        self.commited_in_buffer.extend(commit)
        return commit

    def pop_commited(self, time):
        """Drop leading committed words that end at or before ``time``."""
        drop = 0
        for word in self.commited_in_buffer:
            if not word[1] <= time:
                break
            drop += 1
        del self.commited_in_buffer[:drop]

    def complete(self):
        """Return the words of the last hypothesis that are not committed yet."""
        return self.buffer
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
class OnlineASRProcessor:
|
| 80 |
+
|
| 81 |
+
SAMPLING_RATE = 16000
|
| 82 |
+
|
| 83 |
+
def __init__(
|
| 84 |
+
self,
|
| 85 |
+
asr,
|
| 86 |
+
tokenize_method=None,
|
| 87 |
+
buffer_trimming=("segment", 15),
|
| 88 |
+
logfile=sys.stderr,
|
| 89 |
+
):
|
| 90 |
+
"""asr: WhisperASR object
|
| 91 |
+
tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
|
| 92 |
+
("segment", 15)
|
| 93 |
+
buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
|
| 94 |
+
logfile: where to store the log.
|
| 95 |
+
"""
|
| 96 |
+
self.asr = asr
|
| 97 |
+
self.tokenize = tokenize_method
|
| 98 |
+
self.logfile = logfile
|
| 99 |
+
|
| 100 |
+
self.init()
|
| 101 |
+
|
| 102 |
+
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
|
| 103 |
+
|
| 104 |
+
def init(self, offset=None):
|
| 105 |
+
"""run this when starting or restarting processing"""
|
| 106 |
+
self.audio_buffer = np.array([], dtype=np.float32)
|
| 107 |
+
self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
|
| 108 |
+
self.buffer_time_offset = 0
|
| 109 |
+
if offset is not None:
|
| 110 |
+
self.buffer_time_offset = offset
|
| 111 |
+
self.transcript_buffer.last_commited_time = self.buffer_time_offset
|
| 112 |
+
self.commited = []
|
| 113 |
+
|
| 114 |
+
def insert_audio_chunk(self, audio):
|
| 115 |
+
self.audio_buffer = np.append(self.audio_buffer, audio)
|
| 116 |
+
|
| 117 |
+
def prompt(self):
|
| 118 |
+
"""Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
|
| 119 |
+
"context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
|
| 120 |
+
"""
|
| 121 |
+
k = max(0, len(self.commited) - 1)
|
| 122 |
+
while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset:
|
| 123 |
+
k -= 1
|
| 124 |
+
|
| 125 |
+
p = self.commited[:k]
|
| 126 |
+
p = [t for _, _, t in p]
|
| 127 |
+
prompt = []
|
| 128 |
+
l = 0
|
| 129 |
+
while p and l < 200: # 200 characters prompt size
|
| 130 |
+
x = p.pop(-1)
|
| 131 |
+
l += len(x) + 1
|
| 132 |
+
prompt.append(x)
|
| 133 |
+
non_prompt = self.commited[k:]
|
| 134 |
+
return self.asr.sep.join(prompt[::-1]), self.asr.sep.join(
|
| 135 |
+
t for _, _, t in non_prompt
|
| 136 |
+
)
|
| 137 |
+
|
| 138 |
+
def process_iter(self):
|
| 139 |
+
"""Runs on the current audio buffer.
|
| 140 |
+
Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, "").
|
| 141 |
+
The non-emty text is confirmed (committed) partial transcript.
|
| 142 |
+
"""
|
| 143 |
+
|
| 144 |
+
prompt, non_prompt = self.prompt()
|
| 145 |
+
logger.debug(f"PROMPT: {prompt}")
|
| 146 |
+
logger.debug(f"CONTEXT: {non_prompt}")
|
| 147 |
+
logger.debug(
|
| 148 |
+
f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
|
| 149 |
+
)
|
| 150 |
+
res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
|
| 151 |
+
|
| 152 |
+
# transform to [(beg,end,"word1"), ...]
|
| 153 |
+
tsw = self.asr.ts_words(res)
|
| 154 |
+
|
| 155 |
+
self.transcript_buffer.insert(tsw, self.buffer_time_offset)
|
| 156 |
+
o = self.transcript_buffer.flush()
|
| 157 |
+
self.commited.extend(o)
|
| 158 |
+
completed = self.to_flush(o)
|
| 159 |
+
logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
|
| 160 |
+
the_rest = self.to_flush(self.transcript_buffer.complete())
|
| 161 |
+
logger.debug(f"INCOMPLETE: {the_rest[2]}")
|
| 162 |
+
|
| 163 |
+
# there is a newly confirmed text
|
| 164 |
+
|
| 165 |
+
if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
|
| 166 |
+
if (
|
| 167 |
+
len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec
|
| 168 |
+
): # longer than this
|
| 169 |
+
self.chunk_completed_sentence()
|
| 170 |
+
|
| 171 |
+
if self.buffer_trimming_way == "segment":
|
| 172 |
+
s = self.buffer_trimming_sec # trim the completed segments longer than s,
|
| 173 |
+
else:
|
| 174 |
+
s = 30 # if the audio buffer is longer than 30s, trim it
|
| 175 |
+
|
| 176 |
+
if len(self.audio_buffer) / self.SAMPLING_RATE > s:
|
| 177 |
+
self.chunk_completed_segment(res)
|
| 178 |
+
|
| 179 |
+
# alternative: on any word
|
| 180 |
+
# l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
|
| 181 |
+
# let's find commited word that is less
|
| 182 |
+
# k = len(self.commited)-1
|
| 183 |
+
# while k>0 and self.commited[k][1] > l:
|
| 184 |
+
# k -= 1
|
| 185 |
+
# t = self.commited[k][1]
|
| 186 |
+
logger.debug("chunking segment")
|
| 187 |
+
# self.chunk_at(t)
|
| 188 |
+
|
| 189 |
+
logger.debug(
|
| 190 |
+
f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}"
|
| 191 |
+
)
|
| 192 |
+
return self.to_flush(o)
|
| 193 |
+
|
| 194 |
+
def chunk_completed_sentence(self):
|
| 195 |
+
if self.commited == []:
|
| 196 |
+
return
|
| 197 |
+
logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited])
|
| 198 |
+
sents = self.words_to_sentences(self.commited)
|
| 199 |
+
for s in sents:
|
| 200 |
+
logger.debug(f"\t\tSENT: {s}")
|
| 201 |
+
if len(sents) < 2:
|
| 202 |
+
return
|
| 203 |
+
while len(sents) > 2:
|
| 204 |
+
sents.pop(0)
|
| 205 |
+
# we will continue with audio processing at this timestamp
|
| 206 |
+
chunk_at = sents[-2][1]
|
| 207 |
+
|
| 208 |
+
logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
|
| 209 |
+
self.chunk_at(chunk_at)
|
| 210 |
+
|
| 211 |
+
def chunk_completed_segment(self, res):
|
| 212 |
+
if self.commited == []:
|
| 213 |
+
return
|
| 214 |
+
|
| 215 |
+
ends = self.asr.segments_end_ts(res)
|
| 216 |
+
|
| 217 |
+
t = self.commited[-1][1]
|
| 218 |
+
|
| 219 |
+
if len(ends) > 1:
|
| 220 |
+
|
| 221 |
+
e = ends[-2] + self.buffer_time_offset
|
| 222 |
+
while len(ends) > 2 and e > t:
|
| 223 |
+
ends.pop(-1)
|
| 224 |
+
e = ends[-2] + self.buffer_time_offset
|
| 225 |
+
if e <= t:
|
| 226 |
+
logger.debug(f"--- segment chunked at {e:2.2f}")
|
| 227 |
+
self.chunk_at(e)
|
| 228 |
+
else:
|
| 229 |
+
logger.debug(f"--- last segment not within commited area")
|
| 230 |
+
else:
|
| 231 |
+
logger.debug(f"--- not enough segments to chunk")
|
| 232 |
+
|
| 233 |
+
def chunk_at(self, time):
|
| 234 |
+
"""trims the hypothesis and audio buffer at "time" """
|
| 235 |
+
self.transcript_buffer.pop_commited(time)
|
| 236 |
+
cut_seconds = time - self.buffer_time_offset
|
| 237 |
+
self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
|
| 238 |
+
self.buffer_time_offset = time
|
| 239 |
+
|
| 240 |
+
def words_to_sentences(self, words):
|
| 241 |
+
"""Uses self.tokenize for sentence segmentation of words.
|
| 242 |
+
Returns: [(beg,end,"sentence 1"),...]
|
| 243 |
+
"""
|
| 244 |
+
|
| 245 |
+
cwords = [w for w in words]
|
| 246 |
+
t = " ".join(o[2] for o in cwords)
|
| 247 |
+
s = self.tokenize(t)
|
| 248 |
+
out = []
|
| 249 |
+
while s:
|
| 250 |
+
beg = None
|
| 251 |
+
end = None
|
| 252 |
+
sent = s.pop(0).strip()
|
| 253 |
+
fsent = sent
|
| 254 |
+
while cwords:
|
| 255 |
+
b, e, w = cwords.pop(0)
|
| 256 |
+
w = w.strip()
|
| 257 |
+
if beg is None and sent.startswith(w):
|
| 258 |
+
beg = b
|
| 259 |
+
elif end is None and sent == w:
|
| 260 |
+
end = e
|
| 261 |
+
out.append((beg, end, fsent))
|
| 262 |
+
break
|
| 263 |
+
sent = sent[len(w) :].strip()
|
| 264 |
+
return out
|
| 265 |
+
|
| 266 |
+
def finish(self):
|
| 267 |
+
"""Flush the incomplete text when the whole processing ends.
|
| 268 |
+
Returns: the same format as self.process_iter()
|
| 269 |
+
"""
|
| 270 |
+
o = self.transcript_buffer.complete()
|
| 271 |
+
f = self.to_flush(o)
|
| 272 |
+
logger.debug(f"last, noncommited: {f}")
|
| 273 |
+
self.buffer_time_offset += len(self.audio_buffer) / 16000
|
| 274 |
+
return f
|
| 275 |
+
|
| 276 |
+
def to_flush(
    self,
    sents,
    sep=None,
    offset=0,
):
    """Concatenate timestamped words or sentences into one flushed line.

    Args:
        sents: [(beg1, end1, "sentence1"), ...] or [] if empty.
        sep: string placed between the texts; ``self.asr.sep`` when None.
        offset: value added to both returned timestamps.

    Returns:
        (beg1, end-of-last-sentence, "concatenation of sentences"),
        or (None, None, "") if ``sents`` is empty.
    """
    joiner = self.asr.sep if sep is None else sep
    text = joiner.join(item[2] for item in sents)
    if len(sents) == 0:
        return (None, None, text)
    first_beg = sents[0][0]
    last_end = sents[-1][1]
    return (offset + first_beg, offset + last_end, text)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
class VACOnlineASRProcessor(OnlineASRProcessor):
    """Wraps OnlineASRProcessor with VAC (Voice Activity Controller).

    It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
    it runs VAD and continuously detects whether there is speech or not.
    When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
    """

    def __init__(self, online_chunk_size, *a, **kw):
        """Create the wrapped processor and the VAD iterator.

        Args:
            online_chunk_size: seconds of forwarded audio that must accumulate
                before ``process_iter`` runs the wrapped processor.
            *a, **kw: passed unchanged to ``OnlineASRProcessor``.
        """
        self.online_chunk_size = online_chunk_size

        self.online = OnlineASRProcessor(*a, **kw)

        # VAC:
        import torch

        model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
        from silero_vad_iterator import FixedVADIterator

        # we use the default options there: 500ms silence, 100ms padding, etc.
        self.vac = FixedVADIterator(model)

        self.logfile = self.online.logfile
        self.init()

    def init(self):
        """Reset the wrapped processor, the VAD state and the local buffer."""
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0

        self.is_currently_final = False

        self.status = None  # or "voice" or "nonvoice"
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_offset = 0  # in frames

    def clear_buffer(self):
        """Drop the local audio buffer, advancing ``buffer_offset`` past it."""
        self.buffer_offset += len(self.audio_buffer)
        self.audio_buffer = np.array([], dtype=np.float32)

    def _feed_online(self, samples):
        # Forward audio to the wrapped processor and count how much was sent.
        self.online.insert_audio_chunk(samples)
        self.current_online_chunk_buffer_size += len(samples)

    def insert_audio_chunk(self, audio):
        """Run VAD on ``audio`` and forward the voiced part to the wrapped processor."""
        vad_event = self.vac(audio)
        self.audio_buffer = np.append(self.audio_buffer, audio)

        if vad_event is None:
            if self.status == "voice":
                # Still inside speech: everything buffered is forwarded.
                self._feed_online(self.audio_buffer)
                self.clear_buffer()
            else:
                # We keep 1 second because VAD may later find start of voice in it.
                # But we trim it to prevent OOM.
                overflow = len(self.audio_buffer) - self.SAMPLING_RATE
                self.buffer_offset += max(0, overflow)
                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE :]
            return

        # Position of the first reported event inside the local buffer.
        frame = list(vad_event.values())[0] - self.buffer_offset
        has_start = "start" in vad_event
        has_end = "end" in vad_event

        if has_start and not has_end:
            # Speech begins: restart the wrapped processor at that position.
            self.status = "voice"
            send_audio = self.audio_buffer[frame:]
            self.online.init(
                offset=(frame + self.buffer_offset) / self.SAMPLING_RATE
            )
            self._feed_online(send_audio)
        elif has_end and not has_start:
            # Speech ends: forward the remainder and mark the utterance final.
            self.status = "nonvoice"
            send_audio = self.audio_buffer[:frame]
            self._feed_online(send_audio)
            self.is_currently_final = True
        else:
            # Both a start and an end were reported for this chunk.
            beg = vad_event["start"] - self.buffer_offset
            end = vad_event["end"] - self.buffer_offset
            self.status = "nonvoice"
            send_audio = self.audio_buffer[beg:end]
            self.online.init(offset=(beg + self.buffer_offset) / self.SAMPLING_RATE)
            self._feed_online(send_audio)
            self.is_currently_final = True
        self.clear_buffer()

    def process_iter(self):
        """Return the next result, in the format of OnlineASRProcessor.process_iter().

        Finishes the utterance if VAD marked it final, runs the wrapped
        processor once enough audio was forwarded, otherwise returns
        (None, None, "").
        """
        if self.is_currently_final:
            return self.finish()

        threshold = self.SAMPLING_RATE * self.online_chunk_size
        if self.current_online_chunk_buffer_size > threshold:
            self.current_online_chunk_buffer_size = 0
            return self.online.process_iter()

        print("no online update, only VAD", self.status, file=self.logfile)
        return (None, None, "")

    def finish(self):
        """Flush the wrapped processor and clear the "final" flag."""
        ret = self.online.finish()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        return ret
|
whisper_fastapi_online_server.py
CHANGED
|
@@ -43,7 +43,7 @@ args = parser.parse_args()
|
|
| 43 |
asr, tokenizer = backend_factory(args)
|
| 44 |
|
| 45 |
# Load demo HTML for the root endpoint
|
| 46 |
-
with open("src/live_transcription.html", "r", encoding="utf-8") as f:
|
| 47 |
html = f.read()
|
| 48 |
|
| 49 |
|
|
|
|
| 43 |
asr, tokenizer = backend_factory(args)
|
| 44 |
|
| 45 |
# Load demo HTML for the root endpoint
|
| 46 |
+
with open("src/web/live_transcription.html", "r", encoding="utf-8") as f:
|
| 47 |
html = f.read()
|
| 48 |
|
| 49 |
|
whisper_online.py
CHANGED
|
@@ -5,10 +5,8 @@ import librosa
|
|
| 5 |
from functools import lru_cache
|
| 6 |
import time
|
| 7 |
import logging
|
| 8 |
-
|
| 9 |
-
import
|
| 10 |
-
import soundfile as sf
|
| 11 |
-
import math
|
| 12 |
|
| 13 |
logger = logging.getLogger(__name__)
|
| 14 |
|
|
@@ -25,768 +23,6 @@ def load_audio_chunk(fname, beg, end):
|
|
| 25 |
end_s = int(end * 16000)
|
| 26 |
return audio[beg_s:end_s]
|
| 27 |
|
| 28 |
-
|
| 29 |
-
# Whisper backend
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
class ASRBase:
    """Common interface of the Whisper backends.

    Subclasses implement ``load_model``, ``transcribe`` and ``use_vad``.
    """

    # join transcribe words with this character (" " for whisper_timestamped,
    # "" for faster-whisper because it emits the spaces when needed)
    sep = " "

    def __init__(
        self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr
    ):
        """Store the settings and load the model.

        Args:
            lan: language code, or "auto" (stored as None in
                ``original_language``).
            modelsize, cache_dir, model_dir: passed on to ``load_model``.
            logfile: where to store the log.
        """
        self.logfile = logfile

        self.transcribe_kargs = {}
        self.original_language = None if lan == "auto" else lan

        self.model = self.load_model(modelsize, cache_dir, model_dir)

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        # Accepts the same three arguments that __init__ passes in.
        raise NotImplementedError("must be implemented in the child class")

    def transcribe(self, audio, init_prompt=""):
        raise NotImplementedError("must be implemented in the child class")

    def use_vad(self):
        raise NotImplementedError("must be implemented in the child class")
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
class WhisperTimestampedASR(ASRBase):
    """Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper.
    On the other hand, the installation for GPU could be easier.
    """

    sep = " "

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Load a whisper model; ``model_dir`` is ignored (only logged)."""
        import whisper
        import whisper_timestamped
        from whisper_timestamped import transcribe_timestamped

        self.transcribe_timestamped = transcribe_timestamped
        if model_dir is not None:
            logger.debug("ignoring model_dir, not implemented")
        return whisper.load_model(modelsize, download_root=cache_dir)

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio`` and return the whisper_timestamped result object."""
        return self.transcribe_timestamped(
            self.model,
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            verbose=None,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )

    def ts_words(self, r):
        """Convert a transcribe result object to [(beg,end,"word1"), ...]."""
        return [
            (word["start"], word["end"], word["text"])
            for segment in r["segments"]
            for word in segment["words"]
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in ``res``."""
        return [segment["end"] for segment in res["segments"]]

    def use_vad(self):
        """Enable the backend's VAD option for later transcribe calls."""
        self.transcribe_kargs["vad"] = True

    def set_translate_task(self):
        """Switch later transcribe calls to the "translate" task."""
        self.transcribe_kargs["task"] = "translate"
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
class FasterWhisperASR(ASRBase):
    """Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version."""

    sep = ""

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """Load a faster-whisper model from ``model_dir`` or by ``modelsize``.

        Raises:
            ValueError: if neither ``modelsize`` nor ``model_dir`` is set.
        """
        from faster_whisper import WhisperModel

        # logging.getLogger("faster_whisper").setLevel(logger.level)
        if model_dir is None and modelsize is None:
            raise ValueError("modelsize or model_dir parameter must be set")

        if model_dir is not None:
            logger.debug(
                f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used."
            )
            model_size_or_path = model_dir
        else:
            model_size_or_path = modelsize

        # this worked fast and reliably on NVIDIA L40
        #
        # Alternatives noted by the original authors:
        # - GPU with INT8 (device="cuda", compute_type="int8_float16"):
        #   the transcripts were different, probably worse than with FP16,
        #   and it was slightly (appx 20%) slower.
        # - CPU with INT8 (device="cpu", compute_type="int8"):
        #   works, but slow, appx 10-times than cuda FP16.
        return WhisperModel(
            model_size_or_path,
            device="cuda",
            compute_type="float16",
            download_root=cache_dir,
        )

    def transcribe(self, audio, init_prompt=""):
        """Transcribe ``audio`` and return the list of segments."""
        # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
        # The second returned value contains the language detection result.
        segments, _info = self.model.transcribe(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            beam_size=5,
            word_timestamps=True,
            condition_on_previous_text=True,
            **self.transcribe_kargs,
        )
        return list(segments)

    def ts_words(self, segments):
        """Return [(beg, end, "word"), ...], skipping segments with no_speech_prob > 0.9."""
        # not stripping the spaces -- should not be merged with them!
        return [
            (word.start, word.end, word.word)
            for segment in segments
            for word in segment.words
            if not segment.no_speech_prob > 0.9
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in ``res``."""
        return [segment.end for segment in res]

    def use_vad(self):
        """Enable the backend's VAD filter for later transcribe calls."""
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        """Switch later transcribe calls to the "translate" task."""
        self.transcribe_kargs["task"] = "translate"
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
class MLXWhisper(ASRBase):
    """
    Uses MLX Whisper library as the backend, optimized for Apple Silicon.
    Models available: https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc
    Significantly faster than faster-whisper (without CUDA) on Apple M1.
    """

    sep = " "

    def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
        """
        Loads the MLX-compatible Whisper model.

        Args:
            modelsize (str, optional): The size or name of the Whisper model to load.
                If provided, it will be translated to an MLX-compatible model path using the `translate_model_name` method.
                Example: "large-v3-turbo" -> "mlx-community/whisper-large-v3-turbo".
            cache_dir (str, optional): Path to the directory for caching models.
                **Note**: This is not supported by MLX Whisper and will be ignored.
            model_dir (str, optional): Direct path to a custom model directory.
                If specified, it overrides the `modelsize` parameter.

        Returns:
            The `mlx_whisper.transcribe.transcribe` function; it is stored as
            `self.model` and called by `self.transcribe`.

        Raises:
            ValueError: if neither `modelsize` nor `model_dir` is set, or if
                `modelsize` is not a recognized model name.
        """
        from mlx_whisper.transcribe import ModelHolder, transcribe
        import mlx.core as mx

        if model_dir is not None:
            logger.debug(
                f"Loading whisper model from model_dir {model_dir}. modelsize parameter is not used."
            )
            model_size_or_path = model_dir
        elif modelsize is not None:
            model_size_or_path = self.translate_model_name(modelsize)
            logger.debug(
                f"Loading whisper model {modelsize}. You use mlx whisper, so {model_size_or_path} will be used."
            )
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError("modelsize or model_dir parameter must be set")

        self.model_size_or_path = model_size_or_path

        # In mlx_whisper.transcribe, dtype is defined as:
        # dtype = mx.float16 if decode_options.get("fp16", True) else mx.float32
        # Since we do not use decode_options in self.transcribe, we will set dtype to mx.float16
        dtype = mx.float16
        ModelHolder.get_model(model_size_or_path, dtype)
        return transcribe

    def translate_model_name(self, model_name):
        """
        Translates a given model name to its corresponding MLX-compatible model path.

        Args:
            model_name (str): The name of the model to translate.

        Returns:
            str: The MLX-compatible model path.

        Raises:
            ValueError: if `model_name` is not in the mapping.
        """
        # Dictionary mapping model names to MLX-compatible paths
        model_mapping = {
            "tiny.en": "mlx-community/whisper-tiny.en-mlx",
            "tiny": "mlx-community/whisper-tiny-mlx",
            "base.en": "mlx-community/whisper-base.en-mlx",
            "base": "mlx-community/whisper-base-mlx",
            "small.en": "mlx-community/whisper-small.en-mlx",
            "small": "mlx-community/whisper-small-mlx",
            "medium.en": "mlx-community/whisper-medium.en-mlx",
            "medium": "mlx-community/whisper-medium-mlx",
            "large-v1": "mlx-community/whisper-large-v1-mlx",
            "large-v2": "mlx-community/whisper-large-v2-mlx",
            "large-v3": "mlx-community/whisper-large-v3-mlx",
            "large-v3-turbo": "mlx-community/whisper-large-v3-turbo",
            "large": "mlx-community/whisper-large-mlx",
        }

        # Retrieve the corresponding MLX model path
        mlx_model_path = model_mapping.get(model_name)

        if mlx_model_path:
            return mlx_model_path
        raise ValueError(
            f"Model name '{model_name}' is not recognized or not supported."
        )

    def transcribe(self, audio, init_prompt=""):
        """Transcribe `audio` and return the list under the result's "segments" key."""
        if self.transcribe_kargs:
            logger.warning("Transcribe kwargs (vad, task) are not compatible with MLX Whisper and will be ignored.")
        segments = self.model(
            audio,
            language=self.original_language,
            initial_prompt=init_prompt,
            word_timestamps=True,
            condition_on_previous_text=True,
            path_or_hf_repo=self.model_size_or_path,
        )
        return segments.get("segments", [])

    def ts_words(self, segments):
        """
        Extract timestamped words from transcription segments and skips words with high no-speech probability.

        Returns: [(beg, end, "word"), ...]
        """
        return [
            (word["start"], word["end"], word["word"])
            for segment in segments
            for word in segment.get("words", [])
            if segment.get("no_speech_prob", 0) <= 0.9
        ]

    def segments_end_ts(self, res):
        """Return the end timestamp of every segment in `res`."""
        return [s["end"] for s in res]

    def use_vad(self):
        # Only recorded in transcribe_kargs; self.transcribe does not pass
        # these kwargs to the backend and logs a warning instead.
        self.transcribe_kargs["vad_filter"] = True

    def set_translate_task(self):
        # Only recorded in transcribe_kargs; see the note in use_vad.
        self.transcribe_kargs["task"] = "translate"
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
class OpenaiApiASR(ASRBase):
    """Uses OpenAI's Whisper API for audio transcription."""

    def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
        """Create the API client.

        Args:
            lan: ISO-639-1 language code, or "auto"/None for no language hint.
            temperature: sampling temperature sent with each request.
            logfile: where to store the log.
        """
        self.logfile = logfile

        self.modelname = "whisper-1"
        self.original_language = (
            None if lan == "auto" else lan
        )  # ISO-639-1 language code
        self.response_format = "verbose_json"
        self.temperature = temperature

        self.load_model()

        self.use_vad_opt = False

        # reset the task in set_translate_task
        self.task = "transcribe"

    def load_model(self, *args, **kwargs):
        """Create the OpenAI client; all arguments are ignored."""
        from openai import OpenAI

        self.client = OpenAI()

        self.transcribed_seconds = (
            0  # for logging how many seconds were processed by API, to know the cost
        )

    def ts_words(self, segments):
        """Return [(beg, end, "word"), ...] from an API response.

        When VAD was requested via ``use_vad``, words that start inside a
        segment with ``no_speech_prob`` above 0.8 are skipped.
        """
        no_speech_segments = []
        if self.use_vad_opt:
            for segment in segments.segments:
                # TODO: threshold can be set from outside
                if segment["no_speech_prob"] > 0.8:
                    no_speech_segments.append(
                        (segment.get("start"), segment.get("end"))
                    )

        o = []
        for word in segments.words:
            start = word.start
            end = word.end
            if any(s[0] <= start <= s[1] for s in no_speech_segments):
                # the word lies in a no-speech segment
                continue
            o.append((start, end, word.word))
        return o

    def segments_end_ts(self, res):
        """Return the end timestamp of every word in ``res``."""
        return [s.end for s in res.words]

    def transcribe(self, audio_data, prompt=None, *args, **kwargs):
        """Send ``audio_data`` to the API and return its response.

        Args:
            audio_data: audio samples; written as 16 kHz PCM_16 WAV.
            prompt: optional text prompt for the API.
            **kwargs: ``init_prompt`` is accepted as an alias of ``prompt``,
                because that is the keyword used by the other backends'
                ``transcribe`` and by OnlineASRProcessor.process_iter.
        """
        if prompt is None:
            # Without this, a prompt passed as init_prompt= ended up in
            # **kwargs and was silently dropped.
            prompt = kwargs.get("init_prompt")

        # Write the audio data to a buffer
        buffer = io.BytesIO()
        buffer.name = "temp.wav"
        sf.write(buffer, audio_data, samplerate=16000, format="WAV", subtype="PCM_16")
        buffer.seek(0)  # Reset buffer's position to the beginning

        self.transcribed_seconds += math.ceil(
            len(audio_data) / 16000
        )  # it rounds up to the whole seconds

        params = {
            "model": self.modelname,
            "file": buffer,
            "response_format": self.response_format,
            "temperature": self.temperature,
            "timestamp_granularities": ["word", "segment"],
        }
        if self.task != "translate" and self.original_language:
            params["language"] = self.original_language
        if prompt:
            params["prompt"] = prompt

        if self.task == "translate":
            proc = self.client.audio.translations
        else:
            proc = self.client.audio.transcriptions

        # Process transcription/translation
        transcript = proc.create(**params)
        logger.debug(
            f"OpenAI API processed accumulated {self.transcribed_seconds} seconds"
        )

        return transcript

    def use_vad(self):
        """Make ``ts_words`` skip words inside no-speech segments."""
        self.use_vad_opt = True

    def set_translate_task(self):
        """Use the translations endpoint for later transcribe calls."""
        self.task = "translate"
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
class HypothesisBuffer:
    """Keeps timestamped word hypotheses and commits what two inserts agree on.

    Words are (beg, end, "text") tuples.
    """

    def __init__(self, logfile=sys.stderr):
        self.commited_in_buffer = []
        self.buffer = []
        self.new = []

        self.last_commited_time = 0
        self.last_commited_word = None

        self.logfile = logfile

    def insert(self, new, offset):
        """Store the part of ``new`` that extends the committed words.

        ``offset`` is added to every timestamp. Words starting more than
        0.1 s before ``last_commited_time`` are discarded. If the remaining
        words start within 1 s of ``last_commited_time``, a leading n-gram
        (n = 1..5) equal to the tail of ``commited_in_buffer`` is dropped.
        The result is kept in ``self.new``.
        """
        shifted = [(beg + offset, end + offset, text) for beg, end, text in new]
        self.new = [
            word for word in shifted if word[0] > self.last_commited_time - 0.1
        ]

        if len(self.new) < 1:
            return
        first_beg = self.new[0][0]
        if not abs(first_beg - self.last_commited_time) < 1:
            return
        if not self.commited_in_buffer:
            return

        # Search for 1, 2, ..., 5 consecutive words (n-grams) that are
        # identical in commited and new. If they are, they're dropped.
        max_ngram = min(len(self.commited_in_buffer), len(self.new), 5)
        for size in range(1, max_ngram + 1):
            committed_tail = " ".join(
                word[2] for word in self.commited_in_buffer[-size:]
            )
            new_head = " ".join(word[2] for word in self.new[:size])
            if committed_tail == new_head:
                dropped = [repr(self.new.pop(0)) for _ in range(size)]
                words_msg = " ".join(dropped)
                logger.debug(f"removing last {size} words: {words_msg}")
                break

    def flush(self):
        """Return the committed chunk = the longest common prefix of 2 last inserts."""
        commit = []
        while self.new and self.buffer:
            beg, end, text = self.new[0]
            if text != self.buffer[0][2]:
                break
            commit.append((beg, end, text))
            self.last_commited_word = text
            self.last_commited_time = end
            self.buffer.pop(0)
            self.new.pop(0)

        # What is left of the newest insert becomes the pending hypothesis.
        self.buffer = self.new
        self.new = []
        self.commited_in_buffer.extend(commit)
        return commit

    def pop_commited(self, time):
        """Drop committed words whose end timestamp is at or before ``time``."""
        while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time:
            self.commited_in_buffer.pop(0)

    def complete(self):
        """Return the pending, not yet committed words."""
        return self.buffer
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
class OnlineASRProcessor:
|
| 466 |
-
|
| 467 |
-
SAMPLING_RATE = 16000
|
| 468 |
-
|
| 469 |
-
def __init__(
|
| 470 |
-
self,
|
| 471 |
-
asr,
|
| 472 |
-
tokenize_method=None,
|
| 473 |
-
buffer_trimming=("segment", 15),
|
| 474 |
-
logfile=sys.stderr,
|
| 475 |
-
):
|
| 476 |
-
"""asr: WhisperASR object
|
| 477 |
-
tokenize_method: sentence tokenizer function for the target language. Must be a callable and behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
|
| 478 |
-
("segment", 15)
|
| 479 |
-
buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
|
| 480 |
-
logfile: where to store the log.
|
| 481 |
-
"""
|
| 482 |
-
self.asr = asr
|
| 483 |
-
self.tokenize = tokenize_method
|
| 484 |
-
self.logfile = logfile
|
| 485 |
-
|
| 486 |
-
self.init()
|
| 487 |
-
|
| 488 |
-
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
|
| 489 |
-
|
| 490 |
-
def init(self, offset=None):
|
| 491 |
-
"""run this when starting or restarting processing"""
|
| 492 |
-
self.audio_buffer = np.array([], dtype=np.float32)
|
| 493 |
-
self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
|
| 494 |
-
self.buffer_time_offset = 0
|
| 495 |
-
if offset is not None:
|
| 496 |
-
self.buffer_time_offset = offset
|
| 497 |
-
self.transcript_buffer.last_commited_time = self.buffer_time_offset
|
| 498 |
-
self.commited = []
|
| 499 |
-
|
| 500 |
-
def insert_audio_chunk(self, audio):
|
| 501 |
-
self.audio_buffer = np.append(self.audio_buffer, audio)
|
| 502 |
-
|
| 503 |
-
def prompt(self):
|
| 504 |
-
"""Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
|
| 505 |
-
"context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
|
| 506 |
-
"""
|
| 507 |
-
k = max(0, len(self.commited) - 1)
|
| 508 |
-
while k > 0 and self.commited[k - 1][1] > self.buffer_time_offset:
|
| 509 |
-
k -= 1
|
| 510 |
-
|
| 511 |
-
p = self.commited[:k]
|
| 512 |
-
p = [t for _, _, t in p]
|
| 513 |
-
prompt = []
|
| 514 |
-
l = 0
|
| 515 |
-
while p and l < 200: # 200 characters prompt size
|
| 516 |
-
x = p.pop(-1)
|
| 517 |
-
l += len(x) + 1
|
| 518 |
-
prompt.append(x)
|
| 519 |
-
non_prompt = self.commited[k:]
|
| 520 |
-
return self.asr.sep.join(prompt[::-1]), self.asr.sep.join(
|
| 521 |
-
t for _, _, t in non_prompt
|
| 522 |
-
)
|
| 523 |
-
|
| 524 |
-
def process_iter(self):
|
| 525 |
-
"""Runs on the current audio buffer.
|
| 526 |
-
Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, "").
|
| 527 |
-
The non-emty text is confirmed (committed) partial transcript.
|
| 528 |
-
"""
|
| 529 |
-
|
| 530 |
-
prompt, non_prompt = self.prompt()
|
| 531 |
-
logger.debug(f"PROMPT: {prompt}")
|
| 532 |
-
logger.debug(f"CONTEXT: {non_prompt}")
|
| 533 |
-
logger.debug(
|
| 534 |
-
f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}"
|
| 535 |
-
)
|
| 536 |
-
res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
|
| 537 |
-
|
| 538 |
-
# transform to [(beg,end,"word1"), ...]
|
| 539 |
-
tsw = self.asr.ts_words(res)
|
| 540 |
-
|
| 541 |
-
self.transcript_buffer.insert(tsw, self.buffer_time_offset)
|
| 542 |
-
o = self.transcript_buffer.flush()
|
| 543 |
-
self.commited.extend(o)
|
| 544 |
-
completed = self.to_flush(o)
|
| 545 |
-
logger.debug(f">>>>COMPLETE NOW: {completed[2]}")
|
| 546 |
-
the_rest = self.to_flush(self.transcript_buffer.complete())
|
| 547 |
-
logger.debug(f"INCOMPLETE: {the_rest[2]}")
|
| 548 |
-
|
| 549 |
-
# there is a newly confirmed text
|
| 550 |
-
|
| 551 |
-
if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
|
| 552 |
-
if (
|
| 553 |
-
len(self.audio_buffer) / self.SAMPLING_RATE > self.buffer_trimming_sec
|
| 554 |
-
): # longer than this
|
| 555 |
-
self.chunk_completed_sentence()
|
| 556 |
-
|
| 557 |
-
if self.buffer_trimming_way == "segment":
|
| 558 |
-
s = self.buffer_trimming_sec # trim the completed segments longer than s,
|
| 559 |
-
else:
|
| 560 |
-
s = 30 # if the audio buffer is longer than 30s, trim it
|
| 561 |
-
|
| 562 |
-
if len(self.audio_buffer) / self.SAMPLING_RATE > s:
|
| 563 |
-
self.chunk_completed_segment(res)
|
| 564 |
-
|
| 565 |
-
# alternative: on any word
|
| 566 |
-
# l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
|
| 567 |
-
# let's find commited word that is less
|
| 568 |
-
# k = len(self.commited)-1
|
| 569 |
-
# while k>0 and self.commited[k][1] > l:
|
| 570 |
-
# k -= 1
|
| 571 |
-
# t = self.commited[k][1]
|
| 572 |
-
logger.debug("chunking segment")
|
| 573 |
-
# self.chunk_at(t)
|
| 574 |
-
|
| 575 |
-
logger.debug(
|
| 576 |
-
f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}"
|
| 577 |
-
)
|
| 578 |
-
return self.to_flush(o)
|
| 579 |
-
|
| 580 |
-
def chunk_completed_sentence(self):
|
| 581 |
-
if self.commited == []:
|
| 582 |
-
return
|
| 583 |
-
logger.debug("COMPLETED SENTENCE: ", [s[2] for s in self.commited])
|
| 584 |
-
sents = self.words_to_sentences(self.commited)
|
| 585 |
-
for s in sents:
|
| 586 |
-
logger.debug(f"\t\tSENT: {s}")
|
| 587 |
-
if len(sents) < 2:
|
| 588 |
-
return
|
| 589 |
-
while len(sents) > 2:
|
| 590 |
-
sents.pop(0)
|
| 591 |
-
# we will continue with audio processing at this timestamp
|
| 592 |
-
chunk_at = sents[-2][1]
|
| 593 |
-
|
| 594 |
-
logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
|
| 595 |
-
self.chunk_at(chunk_at)
|
| 596 |
-
|
| 597 |
-
def chunk_completed_segment(self, res):
|
| 598 |
-
if self.commited == []:
|
| 599 |
-
return
|
| 600 |
-
|
| 601 |
-
ends = self.asr.segments_end_ts(res)
|
| 602 |
-
|
| 603 |
-
t = self.commited[-1][1]
|
| 604 |
-
|
| 605 |
-
if len(ends) > 1:
|
| 606 |
-
|
| 607 |
-
e = ends[-2] + self.buffer_time_offset
|
| 608 |
-
while len(ends) > 2 and e > t:
|
| 609 |
-
ends.pop(-1)
|
| 610 |
-
e = ends[-2] + self.buffer_time_offset
|
| 611 |
-
if e <= t:
|
| 612 |
-
logger.debug(f"--- segment chunked at {e:2.2f}")
|
| 613 |
-
self.chunk_at(e)
|
| 614 |
-
else:
|
| 615 |
-
logger.debug(f"--- last segment not within commited area")
|
| 616 |
-
else:
|
| 617 |
-
logger.debug(f"--- not enough segments to chunk")
|
| 618 |
-
|
| 619 |
-
def chunk_at(self, time):
|
| 620 |
-
"""trims the hypothesis and audio buffer at "time" """
|
| 621 |
-
self.transcript_buffer.pop_commited(time)
|
| 622 |
-
cut_seconds = time - self.buffer_time_offset
|
| 623 |
-
self.audio_buffer = self.audio_buffer[int(cut_seconds * self.SAMPLING_RATE) :]
|
| 624 |
-
self.buffer_time_offset = time
|
| 625 |
-
|
| 626 |
-
def words_to_sentences(self, words):
|
| 627 |
-
"""Uses self.tokenize for sentence segmentation of words.
|
| 628 |
-
Returns: [(beg,end,"sentence 1"),...]
|
| 629 |
-
"""
|
| 630 |
-
|
| 631 |
-
cwords = [w for w in words]
|
| 632 |
-
t = " ".join(o[2] for o in cwords)
|
| 633 |
-
s = self.tokenize(t)
|
| 634 |
-
out = []
|
| 635 |
-
while s:
|
| 636 |
-
beg = None
|
| 637 |
-
end = None
|
| 638 |
-
sent = s.pop(0).strip()
|
| 639 |
-
fsent = sent
|
| 640 |
-
while cwords:
|
| 641 |
-
b, e, w = cwords.pop(0)
|
| 642 |
-
w = w.strip()
|
| 643 |
-
if beg is None and sent.startswith(w):
|
| 644 |
-
beg = b
|
| 645 |
-
elif end is None and sent == w:
|
| 646 |
-
end = e
|
| 647 |
-
out.append((beg, end, fsent))
|
| 648 |
-
break
|
| 649 |
-
sent = sent[len(w) :].strip()
|
| 650 |
-
return out
|
| 651 |
-
|
| 652 |
-
def finish(self):
|
| 653 |
-
"""Flush the incomplete text when the whole processing ends.
|
| 654 |
-
Returns: the same format as self.process_iter()
|
| 655 |
-
"""
|
| 656 |
-
o = self.transcript_buffer.complete()
|
| 657 |
-
f = self.to_flush(o)
|
| 658 |
-
logger.debug(f"last, noncommited: {f}")
|
| 659 |
-
self.buffer_time_offset += len(self.audio_buffer) / 16000
|
| 660 |
-
return f
|
| 661 |
-
|
| 662 |
-
def to_flush(
    self,
    sents,
    sep=None,
    offset=0,
):
    """Concatenate timestamped words or sentences into one flushed sequence.

    Args:
        sents: [(beg1, end1, "sentence1"), ...] or [] if empty.
        sep: string placed between the texts; ``self.asr.sep`` is used
            when it is None.
        offset: value added to the returned begin and end timestamps.

    Returns:
        (beg1, end-of-last-sentence, "concatenation of sentences"),
        or (None, None, "") if ``sents`` is empty.
    """
    joiner = self.asr.sep if sep is None else sep
    text = joiner.join(item[2] for item in sents)
    if len(sents) > 0:
        first_beg = offset + sents[0][0]
        last_end = offset + sents[-1][1]
        return (first_beg, last_end, text)
    return (None, None, text)
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
class VACOnlineASRProcessor(OnlineASRProcessor):
    """OnlineASRProcessor wrapped with a VAC (Voice Activity Controller).

    It is used the same way as OnlineASRProcessor: it receives chunks of audio
    (e.g. 0.04 seconds), runs VAD on them and continuously tracks whether
    there is speech or not. When the end of speech is detected (non-voice for
    500ms), the wrapped OnlineASRProcessor is made to end the utterance
    immediately.
    """

    def __init__(self, online_chunk_size, *a, **kw):
        """Create the wrapped processor and the VAD iterator.

        Args:
            online_chunk_size: multiplied by ``SAMPLING_RATE`` to get the
                number of forwarded samples that must be exceeded before
                ``process_iter`` runs the wrapped processor.
            *a, **kw: passed unchanged to ``OnlineASRProcessor``.
        """
        self.online_chunk_size = online_chunk_size

        self.online = OnlineASRProcessor(*a, **kw)

        # VAC: torch and the VAD iterator are imported lazily, so they are
        # only required when this class is actually instantiated.
        import torch

        model, _ = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad")
        from silero_vad_iterator import FixedVADIterator

        # Default options are used here: 500ms silence, 100ms padding, etc.
        self.vac = FixedVADIterator(model)

        self.logfile = self.online.logfile
        self.init()

    def init(self):
        """Reset the wrapped processor, the VAD state and the local buffers."""
        self.online.init()
        self.vac.reset_states()
        self.current_online_chunk_buffer_size = 0

        self.is_currently_final = False

        self.status = None  # or "voice" or "nonvoice"
        self.audio_buffer = np.array([], dtype=np.float32)
        self.buffer_offset = 0  # in frames

    def clear_buffer(self):
        """Empty the local audio buffer, advancing the offset by its length."""
        self.buffer_offset += len(self.audio_buffer)
        self.audio_buffer = np.array([], dtype=np.float32)

    def _feed_online(self, samples):
        """Forward ``samples`` to the wrapped processor and count them."""
        self.online.insert_audio_chunk(samples)
        self.current_online_chunk_buffer_size += len(samples)

    def insert_audio_chunk(self, audio):
        """Run VAD on ``audio`` and forward the voiced part to the wrapped processor."""
        vad_event = self.vac(audio)
        self.audio_buffer = np.append(self.audio_buffer, audio)

        if vad_event is None:
            # No state change reported for this chunk.
            if self.status == "voice":
                self._feed_online(self.audio_buffer)
                self.clear_buffer()
            else:
                # We keep 1 second because VAD may later find start of voice in it.
                # But we trim it to prevent OOM.
                surplus = len(self.audio_buffer) - self.SAMPLING_RATE
                self.buffer_offset += max(0, surplus)
                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE :]
            return

        # The reported position is made relative to the local buffer by
        # subtracting the number of frames already dropped from it.
        frame = list(vad_event.values())[0] - self.buffer_offset
        has_start = "start" in vad_event
        has_end = "end" in vad_event

        if has_start and not has_end:
            # Voice begins: restart the wrapped processor at that position.
            self.status = "voice"
            voiced = self.audio_buffer[frame:]
            self.online.init(offset=(frame + self.buffer_offset) / self.SAMPLING_RATE)
            self._feed_online(voiced)
            self.clear_buffer()
        elif has_end and not has_start:
            # Voice ends: forward what precedes the end and mark the utterance final.
            self.status = "nonvoice"
            voiced = self.audio_buffer[:frame]
            self._feed_online(voiced)
            self.is_currently_final = True
            self.clear_buffer()
        else:
            # Both a start and an end were reported for the buffered audio.
            beg = vad_event["start"] - self.buffer_offset
            end = vad_event["end"] - self.buffer_offset
            self.status = "nonvoice"
            voiced = self.audio_buffer[beg:end]
            self.online.init(offset=(beg + self.buffer_offset) / self.SAMPLING_RATE)
            self._feed_online(voiced)
            self.is_currently_final = True
            self.clear_buffer()

    def process_iter(self):
        """Return the next output, running the wrapped processor only when due.

        Returns:
            The result of ``finish()`` when the utterance was marked final,
            the result of the wrapped ``process_iter()`` when more than
            ``SAMPLING_RATE * online_chunk_size`` samples were forwarded since
            the last run, and ``(None, None, "")`` otherwise.
        """
        if self.is_currently_final:
            return self.finish()

        threshold = self.SAMPLING_RATE * self.online_chunk_size
        if self.current_online_chunk_buffer_size > threshold:
            self.current_online_chunk_buffer_size = 0
            return self.online.process_iter()

        print("no online update, only VAD", self.status, file=self.logfile)
        return (None, None, "")

    def finish(self):
        """Finish the wrapped processor and reset the per-utterance counters."""
        flushed = self.online.finish()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        return flushed
|
| 788 |
-
|
| 789 |
-
|
| 790 |
WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
|
| 791 |
","
|
| 792 |
)
|
|
@@ -852,7 +88,7 @@ def add_shared_args(parser):
|
|
| 852 |
parser.add_argument(
|
| 853 |
"--model",
|
| 854 |
type=str,
|
| 855 |
-
default="
|
| 856 |
choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
|
| 857 |
","
|
| 858 |
),
|
|
@@ -887,14 +123,14 @@ def add_shared_args(parser):
|
|
| 887 |
parser.add_argument(
|
| 888 |
"--backend",
|
| 889 |
type=str,
|
| 890 |
-
default="
|
| 891 |
choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
|
| 892 |
help="Load only this backend for Whisper processing.",
|
| 893 |
)
|
| 894 |
parser.add_argument(
|
| 895 |
"--vac",
|
| 896 |
action="store_true",
|
| 897 |
-
default=
|
| 898 |
help="Use VAC = voice activity controller. Recommended. Requires torch.",
|
| 899 |
)
|
| 900 |
parser.add_argument(
|
|
@@ -903,7 +139,7 @@ def add_shared_args(parser):
|
|
| 903 |
parser.add_argument(
|
| 904 |
"--vad",
|
| 905 |
action="store_true",
|
| 906 |
-
default=
|
| 907 |
help="Use VAD = voice activity detection, with the default parameters.",
|
| 908 |
)
|
| 909 |
parser.add_argument(
|
|
|
|
| 5 |
from functools import lru_cache
|
| 6 |
import time
|
| 7 |
import logging
|
| 8 |
+
from src.whisper_streaming.backends import FasterWhisperASR, MLXWhisper, WhisperTimestampedASR, OpenaiApiASR
|
| 9 |
+
from src.whisper_streaming.online_asr import OnlineASRProcessor, VACOnlineASRProcessor
|
|
|
|
|
|
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 23 |
end_s = int(end * 16000)
|
| 24 |
return audio[beg_s:end_s]
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(
|
| 27 |
","
|
| 28 |
)
|
|
|
|
| 88 |
parser.add_argument(
|
| 89 |
"--model",
|
| 90 |
type=str,
|
| 91 |
+
default="large-v3-turbo",
|
| 92 |
choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large,large-v3-turbo".split(
|
| 93 |
","
|
| 94 |
),
|
|
|
|
| 123 |
parser.add_argument(
|
| 124 |
"--backend",
|
| 125 |
type=str,
|
| 126 |
+
default="faster-whisper",
|
| 127 |
choices=["faster-whisper", "whisper_timestamped", "mlx-whisper", "openai-api"],
|
| 128 |
help="Load only this backend for Whisper processing.",
|
| 129 |
)
|
| 130 |
parser.add_argument(
|
| 131 |
"--vac",
|
| 132 |
action="store_true",
|
| 133 |
+
default=False,
|
| 134 |
help="Use VAC = voice activity controller. Recommended. Requires torch.",
|
| 135 |
)
|
| 136 |
parser.add_argument(
|
|
|
|
| 139 |
parser.add_argument(
|
| 140 |
"--vad",
|
| 141 |
action="store_true",
|
| 142 |
+
default=False,
|
| 143 |
help="Use VAD = voice activity detection, with the default parameters.",
|
| 144 |
)
|
| 145 |
parser.add_argument(
|