Spaces:

WeixuanYuan
/

DiffuSynthV0.2

Running

File size: 12,638 Bytes

ae1bdf7

import json
import os

import librosa
import numpy as np
import torch
from tqdm import tqdm
from scipy.linalg import sqrtm

from metrics.pipelines import sample_pipeline, sample_pipeline_GAN
from metrics.pipelines_STFT import sample_pipeline_STFT, sample_pipeline_GAN_STFT
from tools import rms_normalize


def ASTaudio2feature(device, signal, processor, AST, sampling_rate):
    """Run audio through ``processor`` and ``AST`` and return one feature vector per item.

    Args:
        device: Device the processed inputs are moved to before the forward pass.
        signal: Audio input handed unchanged to ``processor``.
        processor: Callable invoked as ``processor(signal, sampling_rate=..., return_tensors="pt")``;
            its result must support ``.to(device)`` and ``**`` unpacking.
        AST: Model called with the processed inputs; its output must expose
            ``last_hidden_state``.
        sampling_rate: Sampling rate forwarded to ``processor``.

    Returns:
        NumPy array holding ``last_hidden_state[:, 0, :]`` (presumably the
        first-token embedding of every batch item -- confirm against the model).
    """
    model_inputs = processor(signal, sampling_rate=sampling_rate, return_tensors="pt")
    model_inputs = model_inputs.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = AST(**model_inputs).last_hidden_state

    first_position = hidden_states[:, 0, :]
    return first_position.to("cpu").detach().numpy()


def calculate_statistics(features):
    """Return the mean vector and covariance matrix of a set of feature vectors.

    Args:
        features: 2-D array-like in which every row is one observation and
            every column one feature dimension.

    Returns:
        Tuple ``(mu, sigma)``: the column-wise mean and the covariance
        matrix computed with columns treated as variables.
    """
    return np.mean(features, axis=0), np.cov(features, rowvar=False)


def calculate_fid(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Compute the Frechet distance between two Gaussians given their statistics.

    The value returned is
    ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrtm(sigma1 @ sigma2))``
    where both covariance matrices are first regularised by adding ``eps``
    to their diagonals.

    Args:
        mu1: Mean vector of the first distribution.
        sigma1: Covariance matrix of the first distribution (left unmodified).
        mu2: Mean vector of the second distribution.
        sigma2: Covariance matrix of the second distribution (left unmodified).
        eps: Small positive value added to the covariance diagonals.

    Returns:
        The distance as a scalar.
    """
    mu1 = np.asarray(mu1)
    mu2 = np.asarray(mu2)

    # Regularise out-of-place: the previous in-place "+=" modified the
    # caller's arrays (and failed outright on integer-typed input).
    sigma1 = np.asarray(sigma1, dtype=np.float64)
    sigma2 = np.asarray(sigma2, dtype=np.float64)
    sigma1 = sigma1 + np.eye(sigma1.shape[0]) * eps
    sigma2 = sigma2 + np.eye(sigma2.shape[0]) * eps

    ssdiff = np.sum((mu1 - mu2) ** 2.0)
    covmean = sqrtm(sigma1.dot(sigma2))

    # Numerical error can yield a complex result; keep only the real part.
    if np.iscomplexobj(covmean):
        covmean = covmean.real

    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)


def calculate_fid_dict(dict1, dict2, eps=1e-6):
    """Compute the Frechet distance between two pre-computed statistics dicts.

    Each dict must provide a mean vector under ``"mu"`` and a covariance
    matrix under ``"sigma"``. The value returned is
    ``||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrtm(sigma1 @ sigma2))``
    with ``eps`` added to the diagonal of both covariance matrices.

    Args:
        dict1: Statistics of the first distribution (left unmodified).
        dict2: Statistics of the second distribution (left unmodified).
        eps: Small positive value added to the covariance diagonals.

    Returns:
        The distance as a scalar.
    """
    mu1 = np.asarray(dict1["mu"])
    mu2 = np.asarray(dict2["mu"])

    # Regularise copies: an in-place "+=" here would write into the arrays
    # stored in the dicts, so eps would pile up every time the same cached
    # statistics are compared again.
    sigma1 = np.asarray(dict1["sigma"], dtype=np.float64)
    sigma2 = np.asarray(dict2["sigma"], dtype=np.float64)
    sigma1 = sigma1 + np.eye(sigma1.shape[0]) * eps
    sigma2 = sigma2 + np.eye(sigma2.shape[0]) * eps

    ssdiff = np.sum((mu1 - mu2) ** 2.0)
    covmean = sqrtm(sigma1.dot(sigma2))

    # Numerical error can yield a complex result; keep only the real part.
    if np.iscomplexobj(covmean):
        covmean = covmean.real

    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)


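# TODO: AudioLDM -- the commented-out block below is an earlier draft of
# generate_features_with_AudioLDM_and_AST, kept for reference.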
# def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):

#     diffuSynth_features = []

#     # Step 1: Load all wav files in AudioLDM_signals_directory_path
#     AudioLDM_signals = []
#     signal_lengths = set()

#     for file_name in os.listdir(AudioLDM_signals_directory_path):
#         if file_name.endswith('.wav'):
#             file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
#             signal, sr = librosa.load(file_path, sr=16000)  # Load audio file with sampling rate 16000
#             # Normalize
#             AudioLDM_signals.append(rms_normalize(signal))
#             signal_lengths.add(len(signal))

#     # Step 2: Check if all signals have the same length
#     if len(signal_lengths) != 1:
#         raise ValueError("Not all signals have the same length. Please ensure all audio files are of the same length.")

#     # Step 3: Reshape to signal_batches [number_batches, batch_size=8, signal_length]
#     batch_size = 8
#     signal_length = signal_lengths.pop()  # All lengths are the same, get one of them

#     # Create batches
#     signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)]

#     for signal_batch in tqdm(signal_batches):

#         features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=16000)
#         diffuSynth_features.extend(features)

#     if return_feature:
#         return diffuSynth_features
#     else:
#         mu, sigma = calculate_statistics(diffuSynth_features)
#         return {"mu": mu,  "sigma": sigma}

def generate_features_with_AudioLDM_and_AST(device, processor, AST, AudioLDM_signals_directory_path, return_feature=False):
    """Extract AST features from the ``.wav`` files found in a directory.

    Every ``.wav`` file (names starting with ``._`` are ignored) is loaded
    at 16 kHz, cut to its first 4 seconds, passed through ``rms_normalize``
    and fed to ``ASTaudio2feature`` in batches of 8. Files that fail to load
    or are shorter than 4 seconds are reported on stdout and skipped.

    Args:
        device: Device forwarded to ``ASTaudio2feature``.
        processor: Feature extractor forwarded to ``ASTaudio2feature``.
        AST: Model forwarded to ``ASTaudio2feature``.
        AudioLDM_signals_directory_path: Directory that is scanned (not
            recursively) for ``.wav`` files.
        return_feature: If true, return the raw list of feature vectors;
            otherwise return their statistics.

    Returns:
        A list of feature vectors, or ``{"mu": ..., "sigma": ...}`` as
        produced by ``calculate_statistics``.

    Raises:
        ValueError: If no usable ``.wav`` file was found.
    """
    sampling_rate = 16000
    target_length = 4 * sampling_rate  # 4 seconds of audio
    batch_size = 8

    AudioLDM_signals = []

    # Sort the listing so the feature order is reproducible across runs.
    for file_name in sorted(os.listdir(AudioLDM_signals_directory_path)):
        if not file_name.endswith('.wav') or file_name.startswith('._'):
            continue

        file_path = os.path.join(AudioLDM_signals_directory_path, file_name)
        try:
            signal, _ = librosa.load(file_path, sr=sampling_rate)
            if len(signal) < target_length:
                print(f"Error loading {file_name}: The file {file_name} is shorter than 4 seconds.")
                continue
            # Keep only the first 4 seconds so every signal has equal length.
            AudioLDM_signals.append(rms_normalize(signal[:target_length]))
        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort the run.
            print(f"Error loading {file_name}: {e}")

    # All kept signals have exactly target_length samples, so the only
    # failure mode left is an empty collection.
    if not AudioLDM_signals:
        raise ValueError(
            f"No usable .wav files (at least 4 seconds long) were found in "
            f"{AudioLDM_signals_directory_path!r}."
        )

    signal_batches = [AudioLDM_signals[i:i + batch_size] for i in range(0, len(AudioLDM_signals), batch_size)]

    diffuSynth_features = []
    for signal_batch in tqdm(signal_batches):
        features = ASTaudio2feature(device, signal_batch, processor, AST, sampling_rate=sampling_rate)
        diffuSynth_features.extend(features)

    if return_feature:
        return diffuSynth_features

    mu, sigma = calculate_statistics(diffuSynth_features)
    return {"mu": mu, "sigma": sigma}




def generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
                                              positive_prompts, negative_prompts="", CFG=1, sample_steps=10,
                                              task="spectrograms", return_feature=False):
    """Sample audio with the diffusion pipeline and extract AST features from it.

    Runs ``num_batches`` sampling rounds (batch size 8, unseeded), feeds the
    signals returned by each round to ``ASTaudio2feature`` at 16 kHz and
    collects the resulting feature vectors.

    Args:
        device, uNet, VAE, mmm, CLAP_tokenizer: Forwarded positionally to the
            sampling pipeline.
        processor, AST: Forwarded to ``ASTaudio2feature``.
        num_batches: Number of sampling rounds.
        positive_prompts, negative_prompts, CFG, sample_steps: Forwarded to
            the sampling pipeline as keyword arguments.
        task: ``"spectrograms"`` selects ``sample_pipeline``; ``"STFT"``
            selects ``sample_pipeline_STFT``.
        return_feature: If true, return the raw list of feature vectors;
            otherwise return their statistics.

    Returns:
        A list of feature vectors, or ``{"mu": ..., "sigma": ...}`` as
        produced by ``calculate_statistics``.

    Raises:
        NotImplementedError: If ``task`` is neither of the supported values.
    """
    # Reject unsupported tasks before any sampling work is started.
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    sampler = sample_pipeline if task == "spectrograms" else sample_pipeline_STFT

    collected = []
    for _ in tqdm(range(num_batches)):
        # Only the third returned element (the signals) is used here.
        _, _, batch_signals = sampler(
            device, uNet, VAE, mmm, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
            return_latent=False,
        )
        collected.extend(ASTaudio2feature(device, batch_signals, processor, AST, sampling_rate=16000))

    if return_feature:
        return collected

    mu, sigma = calculate_statistics(collected)
    return {"mu": mu, "sigma": sigma}


def generate_features_with_GAN_and_AST(device, gan_generator, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
                                       positive_prompts, negative_prompts="", CFG=1, sample_steps=10,
                                       task="spectrograms", return_feature=False):
    """Sample audio with the GAN pipeline and extract AST features from it.

    Runs ``num_batches`` sampling rounds (batch size 8, unseeded), feeds the
    signals returned by each round to ``ASTaudio2feature`` at 16 kHz and
    collects the resulting feature vectors.

    Args:
        device, gan_generator, VAE, mmm, CLAP_tokenizer: Forwarded
            positionally to the sampling pipeline.
        processor, AST: Forwarded to ``ASTaudio2feature``.
        num_batches: Number of sampling rounds.
        positive_prompts, negative_prompts, CFG, sample_steps: Forwarded to
            the sampling pipeline as keyword arguments.
        task: ``"spectrograms"`` selects ``sample_pipeline_GAN``; ``"STFT"``
            selects ``sample_pipeline_GAN_STFT``.
        return_feature: If true, return the raw list of feature vectors;
            otherwise return their statistics.

    Returns:
        A list of feature vectors, or ``{"mu": ..., "sigma": ...}`` as
        produced by ``calculate_statistics``.

    Raises:
        NotImplementedError: If ``task`` is neither of the supported values.
    """
    # Reject unsupported tasks before any sampling work is started.
    if task not in ("spectrograms", "STFT"):
        raise NotImplementedError
    sampler = sample_pipeline_GAN if task == "spectrograms" else sample_pipeline_GAN_STFT

    collected = []
    for _ in tqdm(range(num_batches)):
        # Only the third returned element (the signals) is used here.
        _, _, batch_signals = sampler(
            device, gan_generator, VAE, mmm, CLAP_tokenizer,
            positive_prompts=positive_prompts,
            negative_prompts=negative_prompts,
            batchsize=8,
            sample_steps=sample_steps,
            CFG=CFG,
            seed=None,
            return_latent=False,
        )
        collected.extend(ASTaudio2feature(device, batch_signals, processor, AST, sampling_rate=16000))

    if return_feature:
        return collected

    mu, sigma = calculate_statistics(collected)
    return {"mu": mu, "sigma": sigma}


def get_FD(train_features, device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches, positive_prompts,
           negative_prompts="", CFG=1, sample_steps=10):
    """Print and return the FID between reference features and freshly sampled audio.

    Args:
        train_features: Reference feature vectors, passed to
            ``calculate_statistics``.
        device, uNet, VAE, mmm, CLAP_tokenizer, processor, AST, num_batches,
        positive_prompts, negative_prompts, CFG, sample_steps: Forwarded to
            ``generate_features_with_diffuSynth_and_AST``.

    Returns:
        The FID score (it is also printed).
    """
    # return_feature=True is required: by default the generator returns a
    # {"mu", "sigma"} dict, which calculate_statistics cannot consume.
    diffuSynth_features = generate_features_with_diffuSynth_and_AST(device, uNet, VAE, mmm, CLAP_tokenizer, processor,
                                                                    AST, num_batches, positive_prompts,
                                                                    negative_prompts=negative_prompts, CFG=CFG,
                                                                    sample_steps=sample_steps,
                                                                    return_feature=True)

    mu_real, sigma_real = calculate_statistics(train_features)
    mu_gen, sigma_gen = calculate_statistics(diffuSynth_features)

    fid_score = calculate_fid(mu_real, sigma_real, mu_gen, sigma_gen)
    print('FID score:', fid_score)
    return fid_score


def get_fid_score(feature1, features2):
    """Return the FID between two collections of feature vectors.

    Args:
        feature1: First collection, passed to ``calculate_statistics``.
        features2: Second collection, passed to ``calculate_statistics``.

    Returns:
        The value computed by ``calculate_fid`` from both sets of statistics.
    """
    first_stats = calculate_statistics(feature1)
    second_stats = calculate_statistics(features2)
    # Each statistics tuple is (mu, sigma), matching calculate_fid's argument order.
    return calculate_fid(*first_stats, *second_stats)


def calculate_fid_matrix(features_list_1, features_list_2, get_fid_score):
    """Build the matrix of pairwise scores between two lists of feature sets.

    Args:
        features_list_1: Feature sets indexing the rows of the result.
        features_list_2: Feature sets indexing the columns of the result.
        get_fid_score: Callable taking ``(feature_set_1, feature_set_2)`` and
            returning the score for that pair.

    Returns:
        Nested list of size ``len(features_list_1)`` x ``len(features_list_2)``
        where entry ``[i][j]`` is the score of ``features_list_1[i]`` against
        ``features_list_2[j]``.
    """
    # Rows are filled left to right, top to bottom.
    return [
        [get_fid_score(row_features, col_features) for col_features in features_list_2]
        for row_features in features_list_1
    ]


def save_AST_feature(key, mu, sigma, path='results/AST_metric/pre_calculated_features/AST_features.json'):
    """Store one ``{"mu", "sigma"}`` entry under ``key`` in a JSON file.

    Existing entries in the file are kept; an entry with the same ``key`` is
    overwritten. The file and its parent directory are created when missing.

    Args:
        key: Name under which the statistics are stored.
        mu: Mean vector; NumPy arrays are converted to nested lists.
        sigma: Covariance matrix; NumPy arrays are converted to nested lists.
        path: Location of the JSON file.

    Raises:
        json.JSONDecodeError: If the existing file is not valid JSON (it is
            left untouched rather than overwritten).
    """
    # Start from the current file content, or from scratch if there is none.
    try:
        with open(path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        data = {}

    # JSON cannot serialise NumPy arrays directly.
    if isinstance(mu, np.ndarray):
        mu = mu.tolist()
    if isinstance(sigma, np.ndarray):
        sigma = sigma.tolist()

    data[key] = {"mu": mu, "sigma": sigma}

    # A missing file is tolerated above, so a missing directory must be
    # tolerated too; otherwise the very first save fails on open().
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)


def read_AST_features(path='results/AST_metric/pre_calculated_features/AST_features.json'):
    """Load stored statistics from a JSON file and convert them to NumPy arrays.

    Args:
        path: Location of the JSON file.

    Returns:
        Dict mapping each stored name to ``{"mu": ndarray, "sigma": ndarray}``.
        An empty dict is returned (and a message printed) when the file is
        missing or does not contain valid JSON.
    """
    try:
        with open(path, 'r') as handle:
            loaded = json.load(handle)
    except FileNotFoundError:
        # Missing file: report it and fall back to an empty result.
        print(f"文件 {path} 未找到.")
        return {}
    except json.JSONDecodeError:
        # Invalid JSON: report it and fall back to an empty result.
        print(f"文件 {path} 不是有效的JSON格式.")
        return {}

    # Replace the JSON lists with arrays, entry by entry, in place.
    for name in loaded.keys():
        entry = loaded[name]
        entry["mu"] = np.array(entry["mu"])
        entry["sigma"] = np.array(entry["sigma"])

    return loaded