cchuan committed
Commit 236a9d1 · 1 Parent(s): cdcd86e

first commit

Files changed (4)
  1. app.py +56 -0
  2. data_setups.py +80 -0
  3. requirements.txt +5 -0
  4. torch_efficientnet_b0_fold4.pth +3 -0
app.py ADDED
@@ -0,0 +1,56 @@
+ import gradio as gr
+ import torch, torchaudio
+ from timeit import default_timer as timer
+ from data_setups import audio_preprocess, resample
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ SAMPLE_RATE = 44100
+ AUDIO_LEN = 2.90
+
+ # Load the serialized model (architecture + weights) onto the CPU
+ model = torch.load("torch_efficientnet_b0_fold4.pth", map_location=torch.device('cpu'))
+
+ # Labels (Chinese): cello, clarinet, flute, acoustic guitar, electric guitar,
+ # organ, piano, saxophone, trumpet, violin, voice
+ CHINESE_LABELS = [
+     "大提琴", "單簧管", "長笛", "民謠吉他", "電吉他", "風琴", "鋼琴", "薩克斯風", "喇叭", "小提琴", "人聲"
+ ]
+
+ example_list = [
+     "samples/guitar_acoustic.wav",
+     "samples/piano.wav",
+     "samples/guitar_electric.wav"
+ ]
+
+ def predict(audio_path):
+     start_time = timer()
+     waveform, sample_rate = torchaudio.load(audio_path)
+     wav = resample(waveform, sample_rate, SAMPLE_RATE)
+     # Trim to a fixed clip length; reject inputs that are too short
+     if len(wav) >= int(AUDIO_LEN * SAMPLE_RATE):
+         wav = wav[:int(AUDIO_LEN * SAMPLE_RATE)]
+     else:
+         print(f"Input length {len(wav)} is too short; need at least {int(AUDIO_LEN * SAMPLE_RATE)} samples")
+         return
+     # Input preprocessing: waveform -> mel-spectrogram image with a batch dimension
+     img = audio_preprocess(wav, SAMPLE_RATE).unsqueeze(0)
+     print(img.shape)
+     model.eval()
+     with torch.inference_mode():
+         pred_probs = torch.softmax(model(img), dim=1)
+     pred_labels_and_probs = {CHINESE_LABELS[i]: float(pred_probs[0][i]) for i in range(len(CHINESE_LABELS))}
+     pred_time = round(timer() - start_time, 5)
+     return pred_labels_and_probs, pred_time
+
+
+ title = "Musical Instrument Classification 🎺🎸🎹🎻"
+ description = "An EfficientNetB0 feature extractor model that classifies 11 different musical instruments"
+ article = ""
+
+ demo = gr.Interface(fn=predict,
+                     inputs=gr.Audio(type="filepath"),
+                     outputs=[gr.Label(num_top_classes=11, label="Predictions"),
+                              gr.Number(label="Prediction time (s)")],
+                     examples=example_list,
+                     title=title,
+                     description=description,
+                     article=article)
+
+ demo.launch(debug=False)
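
For reference, the fixed clip length that predict enforces works out as follows (a small illustrative snippet, not part of the committed file):

# Illustration only: the minimum number of samples predict() accepts
SAMPLE_RATE = 44100
AUDIO_LEN = 2.90
print(int(AUDIO_LEN * SAMPLE_RATE))  # 127890 samples, i.e. ~2.9 s of audio at 44.1 kHz
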
data_setups.py ADDED
@@ -0,0 +1,80 @@
+ # Data setup utilities: class discovery and audio-to-spectrogram preprocessing
+ import os
+ import librosa
+ import torch
+ import numpy as np
+ from torchaudio.transforms import Resample
+
+ SAMPLE_RATE = 44100
+ AUDIO_LEN = 2.90
+
+ # Parameters that control the mel-spectrogram generation
+ N_MELS = 128
+ F_MIN = 20
+ F_MAX = 16000
+ N_FFT = 1024
+ HOP_LEN = 512
+
+ # Find the class folders in a target directory
+ def find_classes(directory: str):
+     # 1. Get the class names by scanning the target directory
+     classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
+     # 2. Raise an error if no class names are found
+     if not classes:
+         raise FileNotFoundError(f"Couldn't find any classes in {directory}.")
+     # 3. Create a dictionary of index labels (computers prefer numerical rather than string labels)
+     class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
+     return classes, class_to_idx
+
+ def resample(wav, sample_rate, new_sample_rate):
+     # Mix multi-channel audio down to mono
+     if wav.shape[0] >= 2:
+         wav = torch.mean(wav, dim=0)
+     else:
+         wav = wav.squeeze(0)
+     # Resample to the target rate when the rates differ
+     if sample_rate != new_sample_rate:
+         resampler = Resample(sample_rate, new_sample_rate)
+         wav = resampler(wav)
+     return wav
+
+ def mono_to_color(X, eps=1e-6, mean=None, std=None):
+     # Stack the single-channel spectrogram into a 3-channel image
+     X = np.stack([X, X, X], axis=-1)
+     # Standardize
+     mean = mean if mean is not None else X.mean()
+     std = std if std is not None else X.std()
+     X = (X - mean) / (std + eps)
+     # Normalize to [0, 255]
+     _min, _max = X.min(), X.max()
+     if (_max - _min) > eps:
+         V = np.clip(X, _min, _max)
+         V = 255 * (V - _min) / (_max - _min)
+         V = V.astype(np.uint8)
+     else:
+         V = np.zeros_like(X, dtype=np.uint8)
+     return V
+
+ def normalize(image, mean=None, std=None):
+     image = image / 255.0
+     if mean is not None and std is not None:
+         image = (image - mean) / std
+     # Move channels first (HWC -> CHW) for PyTorch
+     return np.moveaxis(image, 2, 0).astype(np.float32)
+
+ def compute_melspec(wav, sample_rate=SAMPLE_RATE):
+     melspec = librosa.feature.melspectrogram(
+         y=wav,
+         sr=sample_rate,
+         n_fft=N_FFT,
+         fmin=F_MIN,
+         fmax=F_MAX,
+         n_mels=N_MELS,
+         hop_length=HOP_LEN
+     )
+     # Convert the power spectrogram to decibel units
+     melspec = librosa.power_to_db(melspec).astype(np.float32)
+     return melspec
+
+ def audio_preprocess(wav, sample_rate):
+     wav = wav.numpy()
+     melspec = compute_melspec(wav, sample_rate)
+     image = mono_to_color(melspec)
+     image = normalize(image, mean=None, std=None)
+     image = torch.from_numpy(image)
+     return image
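
A minimal sketch of what audio_preprocess returns, using a synthetic mono waveform in place of a real recording (the dummy input and the exact frame count are assumptions, not part of the commit):

# Sketch: shape of the model input produced by the preprocessing pipeline
import torch
from data_setups import audio_preprocess, SAMPLE_RATE, AUDIO_LEN

dummy_wav = torch.randn(int(AUDIO_LEN * SAMPLE_RATE))  # ~2.9 s of noise at 44.1 kHz
image = audio_preprocess(dummy_wav, SAMPLE_RATE)
print(image.shape)  # torch.Size([3, 128, 250]): 3 channels, N_MELS mel bins, ~250 time frames
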
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==1.12.0
+ torchvision==0.13.0
+ torchaudio==0.12.1
+ gradio==3.1.4
+ librosa==0.9.2
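
These pinned versions can be installed with pip install -r requirements.txt; since app.py calls demo.launch() at module level, running python app.py then starts the demo.
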
torch_efficientnet_b0_fold4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2083c006db34451c229a820aae94091ca7417856b8601da008b20048ccebfb2e
+ size 16419889