In [1]:
! pip install -Uqq fastai gradio torchaudio torchvision huggingface_hub

[K     |████████████████████████████████| 197 kB 24.3 MB/s 
[K     |████████████████████████████████| 2.9 MB 52.4 MB/s 
[K     |████████████████████████████████| 77 kB 5.4 MB/s 
[K     |████████████████████████████████| 60 kB 6.1 MB/s 
[K     |████████████████████████████████| 84 kB 3.1 MB/s 
[K     |████████████████████████████████| 54 kB 2.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 40.9 MB/s 
[K     |████████████████████████████████| 2.0 MB 42.1 MB/s 
[K     |████████████████████████████████| 253 kB 54.9 MB/s 
[K     |████████████████████████████████| 53 kB 2.1 MB/s 
[K     |████████████████████████████████| 212 kB 44.2 MB/s 
[K     |████████████████████████████████| 271 kB 64.4 MB/s 
[K     |████████████████████████████████| 144 kB 45.5 MB/s 
[K     |████████████████████████████████| 94 kB 3.1 MB/s 
[K     |████████████████████████████████| 58 kB 6.2 MB/s 
[K     |████████████████████████████████| 10.9 MB 53.7 MB/s 
[K     |████████████████████████████

In [2]:
import gradio
import torchaudio
from fastai.vision.all import *
from fastai.learner import load_learner
from torchvision.utils import save_image
from huggingface_hub import hf_hub_download

  "class": algorithms.Blowfish,


In [3]:
# Download the exported learner file from the Hugging Face Hub and load it.
# NOTE(review): the file is named model.pkl, so loading presumably unpickles
# it — only point this at repositories you trust.
model = load_learner(
    fname=hf_hub_download(
        repo_id="kurianbenoy/music_genre_classification_baseline",
        filename="model.pkl",
    )
)

# Vocabulary of the learner's DataLoaders; predict() uses it as the label names.
labels = model.dls.vocab

Downloading:   0%|          | 0.00/103M [00:00<?, ?B/s]

In [8]:
N_FFT = 2048  # FFT size; also passed as the window length
HOP_LEN = 1024  # hop between frames (half of N_FFT)


def create_spectrogram(filename):
    """Load an audio file and return its mel spectrogram scaled into [0, 1].

    The audio is converted to a 224-band mel power spectrogram, averaged over
    axis 0 of the transform output, converted to decibels, and then min-max
    normalised.

    Args:
        filename: Path of the audio file, as accepted by ``torchaudio.load``.

    Returns:
        A tensor holding the normalised spectrogram. If the dB spectrogram is
        constant (e.g. a silent recording) an all-zero tensor is returned
        instead of NaNs.
    """
    audio, sr = torchaudio.load(filename)
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LEN,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        onesided=True,
        n_mels=224,
        mel_scale="htk",
    )
    # Collapse axis 0 (presumably the channel axis — TODO confirm) to a single
    # spectrogram by averaging.
    specgram = to_mel(audio).mean(axis=0)
    specgram = torchaudio.transforms.AmplitudeToDB()(specgram)

    # Min-max normalise. After the shift the minimum is 0, so a constant
    # spectrogram has a peak of 0; skip the division in that case to avoid
    # producing NaNs (0 / 0).
    specgram = specgram - specgram.min()
    peak = specgram.max()
    if peak > 0:
        specgram = specgram / peak

    return specgram

In [11]:
def create_image(filename, dest="temp.png"):
    """Render the spectrogram of an audio file to an image file.

    Args:
        filename: Path of the audio file to convert.
        dest: Where to write the image. Defaults to ``"temp.png"`` in the
            current working directory, which is the file the rest of the
            pipeline reads.

    Returns:
        The ``Path`` the image was written to.
    """
    specgram = create_spectrogram(filename)
    dest = Path(dest)
    # Use the computed destination rather than repeating the path literal.
    save_image(specgram, dest)
    return dest

In [4]:
def predict(img):
    """Classify a spectrogram image with the loaded learner.

    Args:
        img: Anything ``PILImage.create`` accepts (the pipeline passes a file
            path).

    Returns:
        A dict mapping every entry of ``labels`` to its probability as a
        Python float.
    """
    image = PILImage.create(img)
    # Only the third value returned by model.predict (the probabilities,
    # indexed in the same order as `labels`) is needed here.
    _decoded, _decoded_idx, probs = model.predict(image)
    return {label: float(probs[idx]) for idx, label in enumerate(labels)}

In [12]:
def end2endpipeline(filename):
    """Run the full audio-to-prediction pipeline for one audio file.

    The audio is first rendered to a spectrogram image by ``create_image``,
    and the image file "temp.png" is then classified by ``predict``.

    Args:
        filename: Path of the audio file to classify.

    Returns:
        The label-to-probability dict produced by ``predict``.
    """
    create_image(filename)
    genre_probs = predict("temp.png")
    return genre_probs

In [13]:
# Presentation settings, forwarded to gradio.Interface as keyword arguments.
interface_options = dict(
    title="Music Genre Classification",
    description="A simple baseline model for classifying music genres with fast.ai on [Kaggle competition data](https://www.kaggle.com/competitions/kaggle-pog-series-s01e02/data)",
    interpretation="default",
    layout="horizontal",
    theme="default",
)

# Wire the pipeline to an audio input (delivered as a file path) and a label
# output limited to the five top classes.
demo = gradio.Interface(
    fn=end2endpipeline,
    inputs=gradio.inputs.Audio(
        source="microphone",
        type="filepath",
        label="Record/ Drop audio",
    ),
    outputs=gradio.outputs.Label(num_top_classes=5),
    **interface_options,
)

# Launch settings, forwarded to demo.launch as keyword arguments.
launch_options = dict(enable_queue=True, share=False)
demo.launch(**launch_options)

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Your interface requires microphone or webcam permissions - this may cause issues in Colab. Use the External URL in case of issues.
Running on public URL: https://55955.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x7f65bf011ed0>,
 'http://127.0.0.1:7862/',
 'https://55955.gradio.app')

In [9]:
# Build the audio input component on its own, wrapped in a list, and display
# its repr as the cell output.
input_audio = [
    gradio.inputs.Audio(
        source="microphone",
        type="filepath",
        label="Record/ Drop audio",
    )
]
input_audio

[Audio(label="Record/ Drop audio")]