# Spaces: Running Running  (appears to be page-header text captured along with the script; not part of the program)
import json | |
import librosa | |
import numpy as np | |
from tqdm import tqdm | |
from metrics.FD import ASTaudio2feature, calculate_statistics, save_AST_feature | |
from tools import rms_normalize | |
from transformers import AutoProcessor, ASTModel | |
# Device the AST model is moved to and that is handed to ASTaudio2feature.
device = "cpu"

# The processor and the model are loaded from the same pretrained checkpoint,
# so the identifier is kept in one place.
_AST_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593"
processor = AutoProcessor.from_pretrained(_AST_CHECKPOINT)
AST = ASTModel.from_pretrained(_AST_CHECKPOINT).to(device)

# NSynth split to process. It selects the metadata file opened here and is
# also interpolated into the audio paths and output names used further down.
data_split = "train"

# Explicit encoding: without it the decoding of the JSON file depends on the
# platform's locale default.
with open(f'data/NSynth/{data_split}_examples.json', encoding="utf-8") as f:
    data = json.load(f)
def read_signal(note_str):
    """Load one note, force it to exactly 64000 samples, and RMS-normalise it.

    Args:
        note_str: Note identifier; used as the WAV file stem under
            ``data/NSynth/nsynth-{data_split}-52/audio/``.

    Returns:
        The result of ``rms_normalize`` applied to the fixed-length signal.
    """
    target_length = 64000  # 4 seconds at the 16 kHz rate requested from librosa

    y, _ = librosa.load(f"data/NSynth/nsynth-{data_split}-52/audio/{note_str}.wav", sr=16000)
    if len(y) >= target_length:
        y = y[:target_length]
    else:
        # Zero-pad at the end. Using np.pad keeps the signal an ndarray, the
        # same type the truncation branch produces, instead of a Python list.
        y = np.pad(y, (0, target_length - len(y)))
    return rms_normalize(y)
# Only notes whose ``pitch`` attribute equals this value are processed.
TARGET_PITCH = 52

QUALITIES = [
    "bright", "dark", "distortion", "fast_decay", "long_release",
    "multiphonic", "nonlinear_env", "percussive", "reverb", "tempo-synced",
]

INSTRUMENT_FAMILIES = [
    "bass", "brass", "flute", "guitar", "keyboard", "mallet",
    "organ", "reed", "string", "synth_lead", "vocal",
]


def _export_group_statistics(label, belongs_to_group):
    """Extract AST features for one group of notes and save their statistics.

    Args:
        label: Group name; results are saved under ``f'{data_split}_{label}'``.
        belongs_to_group: Callable that takes a note's attribute dict and
            returns True when the note is part of the group.
    """
    features = []
    # Iterating the items view directly (no enumerate wrapper) lets tqdm read
    # the length of the collection and display a total.
    for note_str, attributes in tqdm(data.items(), desc=label):
        if attributes["pitch"] != TARGET_PITCH:
            continue
        if not belongs_to_group(attributes):
            continue
        signal = read_signal(note_str)
        feature_for_one_signal = ASTaudio2feature(
            device, [signal], processor, AST, sampling_rate=16000
        )[0]
        features.append(feature_for_one_signal)

    if not features:
        # NOTE(review): how calculate_statistics handles an empty list is not
        # visible from this script, so empty groups are skipped rather than
        # passed through -- confirm this is the desired behaviour.
        print(f"No pitch-{TARGET_PITCH} notes found for '{label}'; skipping.")
        return

    mu, sigma = calculate_statistics(features)
    print(np.shape(mu))
    print(np.shape(sigma))
    save_AST_feature(f'{data_split}_{label}', mu.tolist(), sigma.tolist())


# One statistics file per note quality.
for quality in QUALITIES:
    _export_group_statistics(
        quality,
        # Default argument binds the current loop value to the predicate.
        lambda attributes, quality=quality: quality in attributes['qualities_str'],
    )

# One statistics file per instrument family.
for instrument_name in INSTRUMENT_FAMILIES:
    _export_group_statistics(
        instrument_name,
        lambda attributes, instrument_name=instrument_name: (
            attributes["instrument_family_str"] == instrument_name
        ),
    )