Spaces:
Runtime error
Runtime error
Ben Prystawski
committed on
Commit
·
64425f4
1
Parent(s):
f22f4f8
Added implementation
Browse files
app.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
A Gradio app to transcribe and diarize a podcast using Whisper and pyannote. Adapted from Dwarkesh Patel's Colab notebook here:
|
| 3 |
+
https://colab.research.google.com/drive/1V-Bt5Hm2kjaDb4P1RyMSswsDKyrzc2-3?usp=sharing
|
| 4 |
+
"""
|
| 5 |
+
import whisper
|
| 6 |
+
import datetime
|
| 7 |
+
|
| 8 |
+
import subprocess
|
| 9 |
+
import torch
|
| 10 |
+
import gradio as gr
|
| 11 |
+
import pyannote.audio
|
| 12 |
+
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
|
| 13 |
+
from pyannote.audio import Audio
|
| 14 |
+
from pyannote.core import Segment
|
| 15 |
+
import wave
|
| 16 |
+
import contextlib
|
| 17 |
+
|
| 18 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
def _select_device():
    """Return the torch device to run the embedding model on.

    Prefers Apple MPS, then CUDA, and falls back to the CPU so the module can
    be imported on hosts without a GPU.
    """
    # torch.backends.mps is missing on older torch builds, hence the getattr.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


# Speaker-embedding model applied to each cropped transcript segment.
# The device is detected instead of being hard-coded to "mps", which raises
# at import time on machines without MPS support.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb", device=_select_device()
)

# Shared pyannote audio reader used to crop segments out of the input file.
audio = Audio()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def time(secs):
    """Convert a number of seconds into a ``timedelta`` of whole seconds."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def segment_embedding(segment, duration, audio, path):
    """Compute the speaker embedding for one transcript segment.

    Args:
        segment: Mapping with "start" and "end" entries bounding the segment.
        duration: Length of the audio file; the segment end is clamped to it.
        audio: Object whose ``crop(path, clip)`` returns a
            ``(waveform, sample_rate)`` pair.
        path: Path of the audio file to crop from.

    Returns:
        The result of calling ``embedding_model`` on the cropped waveform
        with one leading axis added.
    """
    # Whisper overshoots the end timestamp in the last segment
    window = Segment(segment["start"], min(duration, segment["end"]))
    waveform, _ = audio.crop(path, window)
    batched = waveform[None]
    return embedding_model(batched)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def get_whisper_results(path, model_type):
    """Transcribe a WAV file with Whisper and measure its length.

    Args:
        path: Path to a WAV file; it is read by Whisper and by ``wave``.
        model_type: Model name passed to ``whisper.load_model``.

    Returns:
        A tuple ``(result, segments, frames, rate, duration)``: the full
        transcription result, its ``"segments"`` entry, the WAV frame count,
        the WAV frame rate, and ``frames / rate`` as a float.
    """
    transcription = whisper.load_model(model_type).transcribe(path)
    segments = transcription["segments"]

    with contextlib.closing(wave.open(path, "r")) as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()

    duration = frames / float(rate)
    return transcription, segments, frames, rate, duration
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def cluster_embeddings(segments, duration, path, num_speakers):
    """Assign a speaker label to every segment by clustering voice embeddings.

    Each segment is embedded with ``segment_embedding`` and the embeddings are
    grouped with agglomerative clustering. The label is written in place to
    ``segment["speaker"]`` as ``"SPEAKER <n>"``, with ``n`` starting at 1.

    Args:
        segments: List of segment dicts (each with "start" and "end");
            mutated in place.
        duration: Audio length, forwarded to ``segment_embedding``.
        path: Audio file path, forwarded to ``segment_embedding``.
        num_speakers: Requested number of speakers. It is converted with
            ``int()`` so a float such as ``2.0`` from a numeric UI field is
            accepted, and it is capped at the number of segments.

    Raises:
        ValueError: If ``num_speakers`` is less than 1.
    """
    n_clusters = int(num_speakers)
    if n_clusters < 1:
        raise ValueError(f"num_speakers must be at least 1, got {num_speakers!r}")

    # Clustering needs at least two samples; with zero or one segment there
    # is nothing to separate, so label directly and skip the embedding work.
    if len(segments) < 2:
        for segment in segments:
            segment["speaker"] = "SPEAKER 1"
        return

    # Asking for more clusters than samples makes the clustering fail.
    n_clusters = min(n_clusters, len(segments))

    # 192 is the embedding width this code expects from embedding_model.
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment, duration, audio, path)

    # Replace NaNs so the clustering does not reject the matrix.
    embeddings = np.nan_to_num(embeddings)

    print(f"num speakers: {n_clusters}")
    labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
    for segment, label in zip(segments, labels):
        segment["speaker"] = "SPEAKER " + str(label + 1)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def transcribe(path, model_type, num_speakers):
    """Transcribe an audio file and label each passage with its speaker.

    Args:
        path: Path to the input audio file. Anything whose name does not end
            in ``.wav`` (case-insensitive) is first converted with ffmpeg to
            ``audio.wav`` in the current working directory.
        model_type: Whisper model name forwarded to ``get_whisper_results``.
        num_speakers: Number of speakers forwarded to ``cluster_embeddings``.

    Returns:
        The transcript as a single string. Every change of speaker starts a
        new paragraph headed by ``"<speaker label> <start time>"``.

    Raises:
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    if not path.lower().endswith(".wav"):
        # check=True stops on a failed conversion instead of silently
        # transcribing a stale audio.wav left over from an earlier run.
        # -y (overwrite) goes first because it is a global ffmpeg option.
        subprocess.run(["ffmpeg", "-y", "-i", path, "audio.wav"], check=True)
        path = "audio.wav"

    _, segments, _, _, duration = get_whisper_results(path, model_type)
    cluster_embeddings(segments, duration, path, num_speakers)

    parts = []
    previous_speaker = None
    for segment in segments:
        speaker = segment["speaker"]
        if speaker != previous_speaker:
            parts.append("\n" + speaker + " " + str(time(segment["start"])) + "\n")
            previous_speaker = speaker

        text = segment["text"]
        # Drop only a leading space; slicing unconditionally would eat the
        # first real character of text that has no leading space.
        if text.startswith(" "):
            text = text[1:]
        parts.append(text + " ")

    return "".join(parts)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
    # Build the web UI: an audio upload, a Whisper model picker and a speaker
    # count are passed to transcribe(); the transcript is shown in a textbox
    # with a copy button.
    interface = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.File(file_count="single", label="Upload an audio file"),
            gr.Radio(
                choices=["tiny", "base", "small", "medium", "large-v3"],
                value="large-v3",
                type="value",
                label="Model size",
            ),
            gr.Number(
                value=2,
                # precision=0 makes the component deliver an int; the
                # clustering step needs a whole number of speakers.
                precision=0,
                label="Number of speakers",
            ),
        ],
        outputs=gr.Textbox(label="Transcript", show_copy_button=True),
        title="Transcribe a podcast!",
        description="Upload an audio file and choose a model size and number of speakers on the left, then click submit to transcribe!",
        theme=gr.themes.Soft(),
    )
    interface.launch()
|