Spaces:

benpry
/

podcast-transcription

Sleeping

App Files Files Community

Ben Prystawski committed on Dec 29, 2023

Commit

64425f4

1 Parent(s): f22f4f8

Added implementation

Browse files

Files changed (1) hide show

app.py +106 -0

app.py ADDED Viewed

	@@ -0,0 +1,106 @@

+"""
+A Gradio app to transcribe and diarize a podcast using Whisper and pyannote. Adapted from Dwarkesh Patel's Colab notebook here:
+https://colab.research.google.com/drive/1V-Bt5Hm2kjaDb4P1RyMSswsDKyrzc2-3?usp=sharing
+"""
+import whisper
+import datetime
+import subprocess
+import torch
+import gradio as gr
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
# Choose the torch device at import time instead of assuming Apple's "mps"
# backend, which is only present on Apple-silicon machines. Preference order:
# CUDA GPU, then MPS, then plain CPU.
if torch.cuda.is_available():
    _embedding_device = torch.device("cuda")
elif (
    getattr(torch.backends, "mps", None) is not None
    and torch.backends.mps.is_available()
):
    _embedding_device = torch.device("mps")
else:
    _embedding_device = torch.device("cpu")

# Pretrained speaker-embedding model, called by segment_embedding() on each
# cropped waveform.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb", device=_embedding_device
)

# Shared pyannote Audio helper; cluster_embeddings() passes it to
# segment_embedding() to crop excerpts out of the audio file.
audio = Audio()
def time(secs):
    """Return *secs* as a ``datetime.timedelta`` rounded to whole seconds.

    The result's ``str()`` form (``H:MM:SS``) is what the transcript uses as
    the timestamp next to each speaker heading.
    """
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)
def segment_embedding(segment, duration, audio, path):
    """Compute the speaker embedding for one transcript segment.

    Args:
        segment: Mapping with ``"start"`` and ``"end"`` timestamps
            (presumably seconds, as produced by Whisper).
        duration: Total length of the audio file, used to cap the end time.
        audio: Object exposing ``crop(path, clip)``; the module passes its
            shared pyannote ``Audio`` instance.
        path: Path of the audio file to crop from.

    Returns:
        Whatever the module-level ``embedding_model`` returns for the cropped
        waveform.
    """
    clip_start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    clip_end = segment["end"] if segment["end"] < duration else duration
    excerpt = Segment(clip_start, clip_end)
    # crop() also returns the sample rate, which is not needed here.
    waveform, _ = audio.crop(path, excerpt)
    # Indexing with None prepends an axis before the model is called.
    batch = waveform[None]
    return embedding_model(batch)
def get_whisper_results(path, model_type):
    """Transcribe a WAV file with Whisper and measure its length.

    Args:
        path: Path to a WAV file readable by the stdlib ``wave`` module.
        model_type: Whisper model name passed to ``whisper.load_model``.

    Returns:
        A 5-tuple ``(result, segments, frames, rate, duration)``: the full
        transcription result, its ``"segments"`` entry, the WAV frame count,
        the WAV frame rate, and ``frames / rate`` as a float.
    """
    transcription = whisper.load_model(model_type).transcribe(path)
    segment_list = transcription["segments"]

    # Wave_read objects are context managers, so the file is closed on exit.
    with wave.open(path, "r") as wav_file:
        n_frames = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
        seconds = n_frames / float(frame_rate)

    return transcription, segment_list, n_frames, frame_rate, seconds
def cluster_embeddings(segments, duration, path, num_speakers):
    """Label every transcript segment with a speaker, in place.

    Each segment's audio is embedded with ``segment_embedding`` and the
    embeddings are grouped with agglomerative clustering. The resulting
    cluster index (1-based) is stored as ``segment["speaker"]`` in the form
    ``"SPEAKER <n>"``.

    Args:
        segments: List of segment dicts with ``"start"`` and ``"end"`` keys;
            each dict gains a ``"speaker"`` key.
        duration: Total audio length, forwarded to ``segment_embedding``.
        path: Path of the audio file the segments refer to.
        num_speakers: Requested number of speakers. Floats such as ``2.0``
            are rounded to the nearest integer.

    Returns:
        None. ``segments`` is mutated.

    Raises:
        ValueError: If ``num_speakers`` rounds to less than 1.
    """
    # UI number widgets can deliver floats (e.g. 2.0), while the clustering
    # estimator expects an integer cluster count.
    num_speakers = int(round(num_speakers))
    if num_speakers < 1:
        raise ValueError("num_speakers must be at least 1")
    print(f"num speakers: {num_speakers}")

    # Never ask for more clusters than there are segments to cluster.
    n_clusters = min(num_speakers, len(segments))

    # With zero/one segment or a single requested speaker there is nothing to
    # cluster: every segment belongs to the same speaker. This also avoids
    # fitting the estimator on fewer than two samples.
    if n_clusters < 2:
        for segment in segments:
            segment["speaker"] = "SPEAKER 1"
        return

    # Stack one row per segment; the embedding width is taken from the model
    # output rather than hard-coded. float64 matches the original zero-filled
    # buffer's dtype.
    embeddings = np.vstack(
        [
            np.asarray(
                segment_embedding(segment, duration, audio, path),
                dtype=np.float64,
            ).reshape(1, -1)
            for segment in segments
        ]
    )
    # Replace NaNs so they cannot reach the clustering step.
    embeddings = np.nan_to_num(embeddings)

    labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
    for segment, label in zip(segments, labels):
        segment["speaker"] = "SPEAKER " + str(label + 1)
def _format_transcript(segments):
    """Render speaker-labelled segments as the transcript text."""
    parts = []
    previous_speaker = None
    for segment in segments:
        speaker = segment["speaker"]
        # Emit a heading with the start time whenever the speaker changes.
        if speaker != previous_speaker:
            parts.append("\n" + speaker + " " + str(time(segment["start"])) + "\n")
            previous_speaker = speaker
        text = segment["text"]
        # Drop only a leading space; slicing unconditionally would eat the
        # first real character of text that has no leading space.
        if text.startswith(" "):
            text = text[1:]
        parts.append(text + " ")
    return "".join(parts)


def transcribe(path, model_type, num_speakers):
    """Transcribe an audio file and label each passage with its speaker.

    Non-WAV inputs are first converted to WAV with ``ffmpeg`` inside a
    private temporary directory, so concurrent calls cannot overwrite each
    other's converted file and a failed conversion can never fall back to a
    stale file from an earlier call.

    Args:
        path: Path to the audio file, or an object exposing the path as
            ``.name`` (such as an uploaded-file wrapper).
        model_type: Whisper model name, forwarded to ``get_whisper_results``.
        num_speakers: Number of speakers, forwarded to ``cluster_embeddings``.

    Returns:
        The transcript as one string: a ``"SPEAKER n H:MM:SS"`` heading
        each time the speaker changes, followed by that speaker's text. Empty
        when no segments were transcribed.

    Raises:
        subprocess.CalledProcessError: If ``ffmpeg`` exits with an error.
    """
    import os
    import tempfile

    # Accept file-wrapper objects as well as plain paths.
    if not isinstance(path, (str, os.PathLike)):
        path = path.name
    path = os.fspath(path)

    with tempfile.TemporaryDirectory() as workdir:
        if os.path.splitext(path)[1].lower() != ".wav":
            converted = os.path.join(workdir, "audio.wav")
            # check=True surfaces conversion failures immediately.
            subprocess.run(["ffmpeg", "-y", "-i", path, converted], check=True)
            path = converted

        _, segments, _, _, duration = get_whisper_results(path, model_type)
        # Nothing to cluster when no segments were produced.
        if segments:
            cluster_embeddings(segments, duration, path, num_speakers)

    return _format_transcript(segments)
if __name__ == "__main__":
    # Build the Gradio UI: a file upload, a Whisper model-size selector and a
    # speaker count feed transcribe(); the transcript is shown in a textbox.
    interface = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.File(file_count="single", label="Upload an audio file"),
            gr.Radio(
                choices=["tiny", "base", "small", "medium", "large-v3"],
                value="large-v3",
                type="value",
                label="Model size",
            ),
            gr.Number(
                value=2,
                label="Number of speakers",
                # precision=0 makes the component deliver an int; the
                # clustering step needs a whole number of speakers.
                precision=0,
                # At least one speaker is required.
                minimum=1,
            ),
        ],
        outputs=gr.Textbox(label="Transcript", show_copy_button=True),
        title="Transcribe a podcast!",
        description="Upload an audio file and choose a model size and number of speakers on the left, then click submit to transcribe!",
        theme=gr.themes.Soft(),
    )
    interface.launch()