Spaces:

kotoba-speech
/

kotoba-whisper-diarization-demo

Paused

App Files Files Community

asahi417 committed on Oct 22, 2024

Commit

b683426

1 Parent(s): dbdfadb

init

Browse files

Files changed (1) hide show

app.py +56 -32

app.py CHANGED Viewed

@@ -5,35 +5,24 @@ from typing import Optional
 import spaces
 import torch
 import gradio as gr
-import numpy as np
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 # config
 model_name = "kotoba-tech/kotoba-whisper-v2.2"
 example_file = "sample_diarization_japanese.mp3"
-# device setting
 if torch.cuda.is_available():
-    torch_dtype = torch.bfloat16
-    device = "cuda"
-    model_kwargs = {'attn_implementation': 'sdpa'}
 else:
-    torch_dtype = torch.float32
-    device = "cpu"
-    model_kwargs = {}
-# define the pipeline
-pipe = pipeline(
-    model=model_name,
-    chunk_length_s=15,
-    batch_size=16,
-    torch_dtype=torch_dtype,
-    device=device,
-    model_kwargs=model_kwargs,
-    trust_remote_code=True
-)
-sampling_rate = pipe.feature_extractor.sampling_rate
 def format_time(start: Optional[float], end: Optional[float]):
@@ -52,23 +41,35 @@ def format_time(start: Optional[float], end: Optional[float]):
 @spaces.GPU
-def get_prediction(inputs):
-    return pipe(inputs, generate_kwargs={"language": "ja", "task": "transcribe"})
-def transcribe(inputs: str):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
     with open(inputs, "rb") as f:
         inputs = f.read()
-    inputs = ffmpeg_read(inputs, sampling_rate)
-    array_pad = np.zeros(int(pipe.feature_extractor.sampling_rate * 0.5))
-    inputs = np.concatenate([array_pad, inputs, array_pad])
-    prediction = get_prediction({"array": inputs, "sampling_rate": sampling_rate})
     output = ""
-    for n, s in enumerate(prediction["speakers"]):
         text_timestamped = "\n".join([f"- **{format_time(*c['timestamp'])}** {c['text']}" for c in prediction[f"chunks/{s}"]])
-        output += f'### Speaker {n+1} \n{text_timestamped}\n'
     return output
@@ -78,11 +79,34 @@ title = f"Audio Transcription and Diarization with {os.path.basename(model_name)
 shared_config = {"fn": transcribe, "title": title, "description": description, "allow_flagging": "never", "examples": [example_file]}
 o_upload = gr.Markdown()
 o_mic = gr.Markdown()
 i_upload = gr.Interface(
-    inputs=[gr.Audio(sources="upload", type="filepath", label="Audio file")], outputs=gr.Markdown(), **shared_config
 )
 i_mic = gr.Interface(
-    inputs=[gr.Audio(sources="microphone", type="filepath", label="Microphone input")], outputs=gr.Markdown(), **shared_config
 )
 with gr.Blocks() as demo:
     gr.TabbedInterface([i_upload, i_mic], ["Audio file", "Microphone"])

 import spaces
 import torch
 import gradio as gr
 from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 # config
 model_name = "kotoba-tech/kotoba-whisper-v2.2"
 example_file = "sample_diarization_japanese.mp3"
 if torch.cuda.is_available():
+    pipe = pipeline(
+        model=model_name,
+        chunk_length_s=15,
+        batch_size=16,
+        torch_dtype=torch.bfloat16,
+        device="cuda",
+        model_kwargs={'attn_implementation': 'sdpa'},
+        trust_remote_code=True
+    )
 else:
+    pipe = pipeline(model=model_name, chunk_length_s=15, batch_size=16, trust_remote_code=True)
 def format_time(start: Optional[float], end: Optional[float]):
 @spaces.GPU
+def get_prediction(inputs, **kwargs):
+    return pipe(inputs, **kwargs)
def transcribe(inputs: str,
               add_punctuation: bool,
               num_speakers: Optional[float],
               min_speakers: Optional[float],
               max_speakers: Optional[float],
               add_silence_end: Optional[float],
               add_silence_start: Optional[float]):
    """Transcribe and diarize an audio file and render the result as Markdown.

    Args:
        inputs: Path of the audio file to process.
        add_punctuation: Passed to ``get_prediction`` unchanged.
        num_speakers: Speaker-count hint. ``0`` or ``None`` is forwarded as
            ``None``; any other value is truncated to ``int``.
        min_speakers: Same normalisation as ``num_speakers``.
        max_speakers: Same normalisation as ``num_speakers``.
        add_silence_end: Forwarded as-is, except that ``0`` or ``None``
            becomes ``None``. (Presumably a padding length in seconds —
            confirm against the pipeline implementation.)
        add_silence_start: Same normalisation as ``add_silence_end``.

    Returns:
        A Markdown string with one ``### Speaker N`` section per entry of
        ``prediction["speaker_ids"]``: the speaker's full text followed by a
        bullet list of timestamped chunks.

    Raises:
        gr.Error: If ``inputs`` is ``None`` (nothing uploaded or recorded).
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    def _count_or_none(value: Optional[float]) -> Optional[int]:
        # The parameters are Optional, so None must be accepted as well as 0;
        # the previous `int(value) if value != 0 else None` raised TypeError
        # on None.
        return int(value) if value else None

    with open(inputs, "rb") as f:
        audio_bytes = f.read()

    # Decode to the sampling rate the feature extractor expects.
    sampling_rate = pipe.feature_extractor.sampling_rate
    array = ffmpeg_read(audio_bytes, sampling_rate)

    prediction = get_prediction(
        inputs={"array": array, "sampling_rate": sampling_rate},
        add_punctuation=add_punctuation,
        num_speakers=_count_or_none(num_speakers),
        min_speakers=_count_or_none(min_speakers),
        max_speakers=_count_or_none(max_speakers),
        add_silence_end=add_silence_end or None,
        add_silence_start=add_silence_start or None,
    )

    sections = []
    for n, s in enumerate(prediction["speaker_ids"]):
        text_timestamped = "\n".join([f"- **{format_time(*c['timestamp'])}** {c['text']}" for c in prediction[f"chunks/{s}"]])
        sections.append(f'### Speaker {n+1} \n{prediction[f"text/{s}"]}\n\n{text_timestamped}\n')
    return "".join(sections)
 shared_config = {"fn": transcribe, "title": title, "description": description, "allow_flagging": "never", "examples": [example_file]}
 o_upload = gr.Markdown()
 o_mic = gr.Markdown()
+options = [
+]
 i_upload = gr.Interface(
+    inputs=[
+        gr.Audio(sources="upload", type="filepath", label="Audio file"),
+        gr.Checkbox(label="add punctuation", value=True),
+        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
+        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
+    ],
+    outputs=gr.Markdown(),
+    **shared_config
 )
 i_mic = gr.Interface(
+    inputs=[
+        gr.Audio(sources="microphone", type="filepath", label="Microphone input"),
+        gr.Checkbox(label="add punctuation", value=True),
+        gr.Slider(0, 10, label="num speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="min speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 10, label="max speakers (set 0 for auto-detect mode)", value=0, step=1),
+        gr.Slider(0, 0.5, label="silence at the end", value=0.5, step=0.05),
+        gr.Slider(0, 0.5, label="silence at the start", value=0.5, step=0.05),
+    ],
+    outputs=gr.Markdown(),
+    **shared_config
 )
 with gr.Blocks() as demo:
     gr.TabbedInterface([i_upload, i_mic], ["Audio file", "Microphone"])