Spaces:
Running
Running
import json | |
import os | |
import librosa | |
import numpy as np | |
import torch | |
from tqdm import tqdm | |
from scipy.linalg import sqrtm | |
from metrics.pipelines import sample_pipeline, sample_pipeline_GAN | |
from metrics.pipelines_STFT import sample_pipeline_STFT, sample_pipeline_GAN_STFT | |
from tools import rms_normalize | |
def ASTaudio2feature(device, signal, processor, AST, sampling_rate): | |
# audio file is decoded on the fly | |
inputs = processor(signal, sampling_rate=sampling_rate, return_tensors="pt").to(device) | |
with torch.no_grad(): | |
outputs = AST(**inputs) | |
last_hidden_states = outputs.last_hidden_state[:, 0, :].to("cpu").detach().numpy() | |
return last_hidden_states | |
# 计算两个numpy数组的均值和协方差矩阵 | |
def calculate_statistics(features): | |
mu = np.mean(features, axis=0) | |
sigma = np.cov(features, rowvar=False) | |
return mu, sigma | |
# 计算FID | |
def calculate_fid(mu1, sigma1, mu2, sigma2, eps=1e-6): | |
# 在协方差矩阵对角线上添加一个小的正值 | |
sigma1 += np.eye(sigma1.shape[0]) * eps | |
sigma2 += np.eye(sigma2.shape[0]) * eps | |
ssdiff = np.sum((mu1 - mu2) ** 2.0) | |
covmean = sqrtm(sigma1.dot(sigma2)) | |
# 由于数值问题,有时可能会得到复数,只取实部 | |
if np.iscomplexobj(covmean): | |
covmean = covmean.real | |
fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean) | |
return fid | |
# 计算FID | |
def calculate_fid_dict(dict1, dict2, eps=1e-6): | |
# 在协方差矩阵对角线上添加一个小的正值 | |
mu1, sigma1 = dict1["mu"], dict1["sigma"] | |
mu2, sigma2 = dict2["mu"], dict2["sigma"] | |
sigma1 += np.eye(sigma1.shape[0]) * eps | |
sigma2 += np.eye(sigma2.shape[0]) * eps | |
ssdiff = np.sum((mu1 - mu2) ** 2.0) | |
covmean = sqrtm(sigma1.dot(sigma2)) | |
# 由于数值问题,有时可能会得到复数,只取实部 | |
if np.iscomplexobj(covmean): | |
covmean = covmean.real | |
fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean) | |
return fid | |
# Todo: AudioLDM | |
# def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False): | |
# diffuSynth_features = [] | |
# # Step 1: Load all wav files in AudioLDM_signals_directory_path | |
# AudioLDM_signals = [] | |
# signal_lengths = set() | |
# for file_name in os.listdir(AudioLDM_signals_directory_path): | |
# if file_name.endswith('.wav'): | |
# file_path = os.path.join(AudioLDM_signals_directory_path, file_name) | |
# signal, sr = librosa.load(file_path, sr=16000) # Load audio file with sampling rate 16000 | |
# # Normalize | |
# AudioLDM_signals.append(rms_normalize(signal)) | |
# signal_lengths.add(len(signal)) | |
# # Step 2: Check if all signals have the same length | |
# if len(signal_lengths) != 1: | |
# raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.") | |
# # Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length] | |
# batch_size = 8 | |
# signal_length = signal_lengths.pop() # All lengths are the same, get one of them | |
# # Create batches | |
# signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)] | |
# for signal_batch in tqdm(signal_batches): | |
# features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000) | |
# diffuSynth_features.extend(features) | |
# if return_feature: | |
# return diffuSynth_features | |
# else: | |
# mu, sigma = calculate_statistics(diffuSynth_features) | |
# return {"mu": mu, "sigma": sigma} | |
def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False): | |
diffuSynth_features = [] | |
# Step 1: Load all wav files in AudioLDM_signals_directory_path | |
AudioLDM_signals = [] | |
signal_lengths = set() | |
target_length = 4 * 16000 # 4 seconds * 16000 samples per second | |
for file_name in os.listdir(AudioLDM_signals_directory_path): | |
if file_name.endswith('.wav') and not file_name.startswith('._'): | |
file_path = os.path.join(AudioLDM_signals_directory_path, file_name) | |
try: | |
signal, sr = librosa.load(file_path, sr=16000) # Load audio file with sampling rate 16000 | |
if len(signal) >= target_length: | |
signal = signal[:target_length] # Take only the first 4 seconds | |
else: | |
raise ValueError(f"The file {file_name} is shorter than 4 seconds.") | |
# Normalize | |
AudioLDM_signals.append(rms_normalize(signal)) | |
signal_lengths.add(len(signal)) | |
except Exception as e: | |
print(f"Error loading {file_name}: {e}") | |
# Step 2: Check if all signals have the same length | |
if len(signal_lengths) != 1: | |
raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.") | |
# Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length] | |
batch_size = 8 | |
signal_length = signal_lengths.pop() # All lengths are the same, get one of them | |
# Create batches | |
signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)] | |
for signal_batch in tqdm(signal_batches): | |
features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000) | |
diffuSynth_features.extend(features) | |
if return_feature: | |
return diffuSynth_features | |
else: | |
mu, sigma = calculate_statistics(diffuSynth_features) | |
return {"mu": mu, "sigma": sigma} | |
def generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, | |
positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False): | |
diffuSynth_features = [] | |
if task == "spectrograms": | |
pipe = sample_pipeline | |
elif task == "STFT": | |
pipe = sample_pipeline_STFT | |
else: | |
raise NotImplementedError | |
for _ in tqdm(range(num_batches)): | |
quantized_latent_representations, reconstruction_batch, signals = pipe(device, uNet, VAE, mmm, | |
CLAP_tokenizer, | |
positive_prompts=positive_prompts, | |
negative_prompts=negative_prompts, | |
batchsize=8, | |
sample_steps=sample_steps, | |
CFG=CFG, seed=None, | |
return_latent=False) | |
features = ASTaudio2feature(device, signals, processor, AST, sampling_rate=16000) | |
diffuSynth_features.extend(features) | |
if return_feature: | |
return diffuSynth_features | |
else: | |
mu, sigma = calculate_statistics(diffuSynth_features) | |
return {"mu": mu, "sigma": sigma} | |
def generate_features_with_GAN_and_AST(device, gan_generator, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, | |
positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False): | |
diffuSynth_features = [] | |
if task == "spectrograms": | |
pipe = sample_pipeline_GAN | |
elif task == "STFT": | |
pipe = sample_pipeline_GAN_STFT | |
else: | |
raise NotImplementedError | |
for _ in tqdm(range(num_batches)): | |
quantized_latent_representations, reconstruction_batch, signals = pipe(device, gan_generator, VAE, mmm, | |
CLAP_tokenizer, | |
positive_prompts=positive_prompts, | |
negative_prompts=negative_prompts, | |
batchsize=8, | |
sample_steps=sample_steps, | |
CFG=CFG, seed=None, | |
return_latent=False) | |
features = ASTaudio2feature(device, signals, processor, AST, sampling_rate=16000) | |
diffuSynth_features.extend(features) | |
if return_feature: | |
return diffuSynth_features | |
else: | |
mu, sigma = calculate_statistics(diffuSynth_features) | |
return {"mu": mu, "sigma": sigma} | |
def get_FD(train_features, device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, positive_prompts, | |
negative_prompts="", CFG=1, sample_steps=10): | |
diffuSynth_features = generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor, | |
AST, num_batches, positive_prompts, | |
negative_prompts=negative_prompts, CFG=CFG, | |
sample_steps=sample_steps) | |
mu_real, sigma_real = calculate_statistics(train_features) | |
mu_gen, sigma_gen = calculate_statistics(diffuSynth_features) | |
fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen) | |
print('FID score:', fid_score) | |
def get_fid_score(feature1, features2): | |
mu_real, sigma_real = calculate_statistics(feature1) | |
mu_gen, sigma_gen = calculate_statistics(features2) | |
fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen) | |
# print('FID score:', fid_score) | |
return fid_score | |
def calculate_fid_matrix(features_list_1, features_list_2, get_fid_score): | |
# 初始化一个矩阵来存储FID分数 | |
# 矩阵的大小为 len(features_list_1) x len(features_list_2) | |
fid_scores = [[0 for _ in range(len(features_list_2))] for _ in range(len(features_list_1))] | |
# 遍历两个列表,并计算每一对特征集合的FID分数 | |
for i, feature1 in enumerate(features_list_1): | |
for j, feature2 in enumerate(features_list_2): | |
fid_scores[i][j] = get_fid_score(feature1, feature2) | |
return fid_scores | |
def save_AST_feature(key, mu, sigma, path='results/AST_metric/pre_calculated_features/AST_features.json'): | |
# 尝试打开并读取现有的JSON文件 | |
try: | |
with open(path, 'r') as file: | |
data = json.load(file) | |
except FileNotFoundError: | |
# 如果文件不存在,创建一个新的字典 | |
data = {} | |
if isinstance(mu, np.ndarray): | |
mu = mu.tolist() | |
if isinstance(sigma, np.ndarray): | |
sigma = sigma.tolist() | |
# 添加新数据 | |
data[key] = {"mu": mu, "sigma": sigma} | |
# 将更新后的数据写回文件 | |
with open(path, 'w') as file: | |
json.dump(data, file, indent=4) | |
def read_AST_features(path='results/AST_metric/pre_calculated_features/AST_features.json'): | |
try: | |
# 尝试打开并读取JSON文件 | |
with open(path, 'r') as file: | |
AST_features = json.load(file) | |
for AST_feature_name in AST_features.keys(): | |
AST_features[AST_feature_name]["mu"] = np.array(AST_features[AST_feature_name]["mu"]) | |
AST_features[AST_feature_name]["sigma"] = np.array(AST_features[AST_feature_name]["sigma"]) | |
return AST_features | |
except FileNotFoundError: | |
# 如果文件不存在,返回一个空字典 | |
print(f"文件 {path} 未找到.") | |
return {} | |
except json.JSONDecodeError: | |
# 如果文件不是有效的JSON,返回一个空字典 | |
print(f"文件 {path} 不是有效的JSON格式.") | |
return {} |