Spaces:
Running
Running
File size: 12,638 Bytes
ae1bdf7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
import json
import os
import librosa
import numpy as np
import torch
from tqdm import tqdm
from scipy.linalg import sqrtm
from metrics.pipelines import sample_pipeline, sample_pipeline_GAN
from metrics.pipelines_STFT import sample_pipeline_STFT, sample_pipeline_GAN_STFT
from tools import rms_normalize
def ASTaudio2feature(device, signal, processor, AST, sampling_rate):
# audio file is decoded on the fly
inputs = processor(signal, sampling_rate=sampling_rate, return_tensors="pt").to(device)
with torch.no_grad():
outputs = AST(**inputs)
last_hidden_states = outputs.last_hidden_state[:, 0, :].to("cpu").detach().numpy()
return last_hidden_states
# 计算两个numpy数组的均值和协方差矩阵
def calculate_statistics(features):
mu = np.mean(features, axis=0)
sigma = np.cov(features, rowvar=False)
return mu, sigma
# 计算FID
def calculate_fid(mu1, sigma1, mu2, sigma2, eps=1e-6):
# 在协方差矩阵对角线上添加一个小的正值
sigma1 += np.eye(sigma1.shape[0]) * eps
sigma2 += np.eye(sigma2.shape[0]) * eps
ssdiff = np.sum((mu1 - mu2) ** 2.0)
covmean = sqrtm(sigma1.dot(sigma2))
# 由于数值问题,有时可能会得到复数,只取实部
if np.iscomplexobj(covmean):
covmean = covmean.real
fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
return fid
# 计算FID
def calculate_fid_dict(dict1, dict2, eps=1e-6):
# 在协方差矩阵对角线上添加一个小的正值
mu1, sigma1 = dict1["mu"], dict1["sigma"]
mu2, sigma2 = dict2["mu"], dict2["sigma"]
sigma1 += np.eye(sigma1.shape[0]) * eps
sigma2 += np.eye(sigma2.shape[0]) * eps
ssdiff = np.sum((mu1 - mu2) ** 2.0)
covmean = sqrtm(sigma1.dot(sigma2))
# 由于数值问题,有时可能会得到复数,只取实部
if np.iscomplexobj(covmean):
covmean = covmean.real
fid = ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
return fid
# Todo: AudioLDM
# def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):
# diffuSynth_features = []
# # Step 1: Load all wav files in AudioLDM_signals_directory_path
# AudioLDM_signals = []
# signal_lengths = set()
# for file_name in os.listdir(AudioLDM_signals_directory_path):
# if file_name.endswith('.wav'):
# file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
# signal, sr = librosa.load(file_path, sr=16000) # Load audio file with sampling rate 16000
# # Normalize
# AudioLDM_signals.append(rms_normalize(signal))
# signal_lengths.add(len(signal))
# # Step 2: Check if all signals have the same length
# if len(signal_lengths) != 1:
# raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.")
# # Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length]
# batch_size = 8
# signal_length = signal_lengths.pop() # All lengths are the same, get one of them
# # Create batches
# signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)]
# for signal_batch in tqdm(signal_batches):
# features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000)
# diffuSynth_features.extend(features)
# if return_feature:
# return diffuSynth_features
# else:
# mu, sigma = calculate_statistics(diffuSynth_features)
# return {"mu": mu, "sigma": sigma}
def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):
diffuSynth_features = []
# Step 1: Load all wav files in AudioLDM_signals_directory_path
AudioLDM_signals = []
signal_lengths = set()
target_length = 4 * 16000 # 4 seconds * 16000 samples per second
for file_name in os.listdir(AudioLDM_signals_directory_path):
if file_name.endswith('.wav') and not file_name.startswith('._'):
file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
try:
signal, sr = librosa.load(file_path, sr=16000) # Load audio file with sampling rate 16000
if len(signal) >= target_length:
signal = signal[:target_length] # Take only the first 4 seconds
else:
raise ValueError(f"The file {file_name} is shorter than 4 seconds.")
# Normalize
AudioLDM_signals.append(rms_normalize(signal))
signal_lengths.add(len(signal))
except Exception as e:
print(f"Error loading {file_name}: {e}")
# Step 2: Check if all signals have the same length
if len(signal_lengths) != 1:
raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.")
# Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length]
batch_size = 8
signal_length = signal_lengths.pop() # All lengths are the same, get one of them
# Create batches
signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)]
for signal_batch in tqdm(signal_batches):
features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000)
diffuSynth_features.extend(features)
if return_feature:
return diffuSynth_features
else:
mu, sigma = calculate_statistics(diffuSynth_features)
return {"mu": mu, "sigma": sigma}
def generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False):
diffuSynth_features = []
if task == "spectrograms":
pipe = sample_pipeline
elif task == "STFT":
pipe = sample_pipeline_STFT
else:
raise NotImplementedError
for _ in tqdm(range(num_batches)):
quantized_latent_representations, reconstruction_batch, signals = pipe(device, uNet, VAE, mmm,
CLAP_tokenizer,
positive_prompts=positive_prompts,
negative_prompts=negative_prompts,
batchsize=8,
sample_steps=sample_steps,
CFG=CFG, seed=None,
return_latent=False)
features = ASTaudio2feature(device, signals, processor, AST, sampling_rate=16000)
diffuSynth_features.extend(features)
if return_feature:
return diffuSynth_features
else:
mu, sigma = calculate_statistics(diffuSynth_features)
return {"mu": mu, "sigma": sigma}
def generate_features_with_GAN_and_AST(device, gan_generator, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
positive_prompts, negative_prompts="", CFG=1, sample_steps=10, task="spectrograms", return_feature=False):
diffuSynth_features = []
if task == "spectrograms":
pipe = sample_pipeline_GAN
elif task == "STFT":
pipe = sample_pipeline_GAN_STFT
else:
raise NotImplementedError
for _ in tqdm(range(num_batches)):
quantized_latent_representations, reconstruction_batch, signals = pipe(device, gan_generator, VAE, mmm,
CLAP_tokenizer,
positive_prompts=positive_prompts,
negative_prompts=negative_prompts,
batchsize=8,
sample_steps=sample_steps,
CFG=CFG, seed=None,
return_latent=False)
features = ASTaudio2feature(device, signals, processor, AST, sampling_rate=16000)
diffuSynth_features.extend(features)
if return_feature:
return diffuSynth_features
else:
mu, sigma = calculate_statistics(diffuSynth_features)
return {"mu": mu, "sigma": sigma}
def get_FD(train_features, device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, positive_prompts,
negative_prompts="", CFG=1, sample_steps=10):
diffuSynth_features = generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor,
AST, num_batches, positive_prompts,
negative_prompts=negative_prompts, CFG=CFG,
sample_steps=sample_steps)
mu_real, sigma_real = calculate_statistics(train_features)
mu_gen, sigma_gen = calculate_statistics(diffuSynth_features)
fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen)
print('FID score:', fid_score)
def get_fid_score(feature1, features2):
mu_real, sigma_real = calculate_statistics(feature1)
mu_gen, sigma_gen = calculate_statistics(features2)
fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen)
# print('FID score:', fid_score)
return fid_score
def calculate_fid_matrix(features_list_1, features_list_2, get_fid_score):
# 初始化一个矩阵来存储FID分数
# 矩阵的大小为 len(features_list_1) x len(features_list_2)
fid_scores = [[0 for _ in range(len(features_list_2))] for _ in range(len(features_list_1))]
# 遍历两个列表,并计算每一对特征集合的FID分数
for i, feature1 in enumerate(features_list_1):
for j, feature2 in enumerate(features_list_2):
fid_scores[i][j] = get_fid_score(feature1, feature2)
return fid_scores
def save_AST_feature(key, mu, sigma, path='results/AST_metric/pre_calculated_features/AST_features.json'):
# 尝试打开并读取现有的JSON文件
try:
with open(path, 'r') as file:
data = json.load(file)
except FileNotFoundError:
# 如果文件不存在,创建一个新的字典
data = {}
if isinstance(mu, np.ndarray):
mu = mu.tolist()
if isinstance(sigma, np.ndarray):
sigma = sigma.tolist()
# 添加新数据
data[key] = {"mu": mu, "sigma": sigma}
# 将更新后的数据写回文件
with open(path, 'w') as file:
json.dump(data, file, indent=4)
def read_AST_features(path='results/AST_metric/pre_calculated_features/AST_features.json'):
try:
# 尝试打开并读取JSON文件
with open(path, 'r') as file:
AST_features = json.load(file)
for AST_feature_name in AST_features.keys():
AST_features[AST_feature_name]["mu"] = np.array(AST_features[AST_feature_name]["mu"])
AST_features[AST_feature_name]["sigma"] = np.array(AST_features[AST_feature_name]["sigma"])
return AST_features
except FileNotFoundError:
# 如果文件不存在,返回一个空字典
print(f"文件 {path} 未找到.")
return {}
except json.JSONDecodeError:
# 如果文件不是有效的JSON,返回一个空字典
print(f"文件 {path} 不是有效的JSON格式.")
return {} |