initial commit
- .gitattributes +1 -0
- .gitignore +1 -0
- Arg_Parser.py +30 -0
- Checkpoint/S_200000.pt +3 -0
- Datasets.py +146 -0
- Hyper_Parameters.yaml +45 -0
- Inference.py +168 -0
- Modules/Diffusion.py +403 -0
- Modules/Layer.py +317 -0
- Modules/Modules.py +265 -0
- Pattern_Generator.py +64 -0
- README.md +4 -4
- YAML/Genre_Info.yaml +1 -0
- YAML/Log_Energy_Info.yaml +3 -0
- YAML/Log_F0_Info.yaml +3 -0
- YAML/Mel_Range_Info.yaml +3 -0
- YAML/Singer_Info.yaml +1 -0
- YAML/Spectrogram_Range_Info.yaml +3 -0
- YAML/Token.yaml +71 -0
- app.py +81 -0
- meldataset.py +230 -0
- requirements.txt +7 -0
- vocoder.pts +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pts filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
*.pyc
Arg_Parser.py
ADDED
@@ -0,0 +1,30 @@
from argparse import Namespace

def Recursive_Parse(args_dict):
    parsed_dict = {}
    for key, value in args_dict.items():
        if isinstance(value, dict):
            value = Recursive_Parse(value)
        parsed_dict[key] = value

    args = Namespace()
    args.__dict__ = parsed_dict
    return args

def To_Non_Recursive_Dict(
    args: Namespace
    ):
    parsed_dict = {}
    for key, value in args.__dict__.items():
        if isinstance(value, Namespace):
            value_dict = To_Non_Recursive_Dict(value)
            for sub_key, sub_value in value_dict.items():
                parsed_dict[f'{key}.{sub_key}'] = sub_value
        else:
            parsed_dict[key] = value

    return parsed_dict
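As a quick illustration of how Recursive_Parse and To_Non_Recursive_Dict mirror each other, here is a minimal, self-contained sketch; the nested dictionary is a made-up example, not part of the repository.

from Arg_Parser import Recursive_Parse, To_Non_Recursive_Dict

# A toy nested config, standing in for a parsed hyper-parameter YAML.
config = {'Sound': {'Sample_Rate': 22050, 'Frame_Shift': 256}, 'Feature_Type': 'Mel'}

hp = Recursive_Parse(config)
print(hp.Sound.Sample_Rate)   # 22050, nested dicts become nested Namespaces
print(hp.Feature_Type)        # 'Mel'

flat = To_Non_Recursive_Dict(hp)
print(flat)   # {'Sound.Sample_Rate': 22050, 'Sound.Frame_Shift': 256, 'Feature_Type': 'Mel'}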
Checkpoint/S_200000.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6482992a43b8a98554e7ef9e487a381c2717c5828d564e6dfc6cac16a0e16092
size 682529563
Datasets.py
ADDED
@@ -0,0 +1,146 @@
from argparse import Namespace
import torch
import numpy as np
import pickle, os, logging
from typing import Dict, List, Optional
import hgtk

from Pattern_Generator import Convert_Feature_Based_Music, Expand_by_Duration

def Decompose(syllable: str):
    onset, nucleus, coda = hgtk.letter.decompose(syllable)
    coda += '_'

    return onset, nucleus, coda

def Lyric_to_Token(lyric: List[str], token_dict: Dict[str, int]):
    return [
        token_dict[letter]
        for letter in list(lyric)
        ]

def Token_Stack(tokens: List[List[int]], token_dict: Dict[str, int], max_length: Optional[int]= None):
    max_token_length = max_length or max([len(token) for token in tokens])
    tokens = np.stack(
        [np.pad(token[:max_token_length], [0, max_token_length - len(token[:max_token_length])], constant_values= token_dict['<X>']) for token in tokens],
        axis= 0
        )
    return tokens

def Note_Stack(notes: List[List[int]], max_length: Optional[int]= None):
    max_note_length = max_length or max([len(note) for note in notes])
    notes = np.stack(
        [np.pad(note[:max_note_length], [0, max_note_length - len(note[:max_note_length])], constant_values= 0) for note in notes],
        axis= 0
        )
    return notes

def Duration_Stack(durations: List[List[int]], max_length: Optional[int]= None):
    max_duration_length = max_length or max([len(duration) for duration in durations])
    durations = np.stack(
        [np.pad(duration[:max_duration_length], [0, max_duration_length - len(duration[:max_duration_length])], constant_values= 0) for duration in durations],
        axis= 0
        )
    return durations

def Feature_Stack(features: List[np.array], max_length: Optional[int]= None):
    max_feature_length = max_length or max([feature.shape[0] for feature in features])
    features = np.stack(
        [np.pad(feature, [[0, max_feature_length - feature.shape[0]], [0, 0]], constant_values= -1.0) for feature in features],
        axis= 0
        )
    return features

def Log_F0_Stack(log_f0s: List[np.array], max_length: int= None):
    max_log_f0_length = max_length or max([len(log_f0) for log_f0 in log_f0s])
    log_f0s = np.stack(
        [np.pad(log_f0, [0, max_log_f0_length - len(log_f0)], constant_values= 0.0) for log_f0 in log_f0s],
        axis= 0
        )
    return log_f0s

class Inference_Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        token_dict: Dict[str, int],
        singer_info_dict: Dict[str, int],
        genre_info_dict: Dict[str, int],
        durations: List[List[float]],
        lyrics: List[List[str]],
        notes: List[List[int]],
        singers: List[str],
        genres: List[str],
        sample_rate: int,
        frame_shift: int,
        equality_duration: bool= False,
        consonant_duration: int= 3
        ):
        super().__init__()
        self.token_dict = token_dict
        self.singer_info_dict = singer_info_dict
        self.genre_info_dict = genre_info_dict
        self.equality_duration = equality_duration
        self.consonant_duration = consonant_duration

        self.patterns = []
        for index, (duration, lyric, note, singer, genre) in enumerate(zip(durations, lyrics, notes, singers, genres)):
            if not singer in self.singer_info_dict.keys():
                logging.warning('The singer \'{}\' is incorrect. The pattern \'{}\' is ignored.'.format(singer, index))
                continue
            if not genre in self.genre_info_dict.keys():
                logging.warning('The genre \'{}\' is incorrect. The pattern \'{}\' is ignored.'.format(genre, index))
                continue

            music = [x for x in zip(duration, lyric, note)]
            singer_label = singer
            text = lyric

            lyric, note, duration = Convert_Feature_Based_Music(
                music= music,
                sample_rate= sample_rate,
                frame_shift= frame_shift,
                consonant_duration= consonant_duration,
                equality_duration= equality_duration
                )
            lyric_expand, note_expand, duration_expand = Expand_by_Duration(lyric, note, duration)

            singer = self.singer_info_dict[singer]
            genre = self.genre_info_dict[genre]

            self.patterns.append((lyric_expand, note_expand, duration_expand, singer, genre, singer_label, text))

    def __getitem__(self, idx):
        lyric, note, duration, singer, genre, singer_label, text = self.patterns[idx]

        return Lyric_to_Token(lyric, self.token_dict), note, duration, singer, genre, singer_label, text

    def __len__(self):
        return len(self.patterns)

class Inference_Collater:
    def __init__(self,
        token_dict: Dict[str, int]
        ):
        self.token_dict = token_dict

    def __call__(self, batch):
        tokens, notes, durations, singers, genres, singer_labels, lyrics = zip(*batch)

        lengths = np.array([len(token) for token in tokens])

        max_length = max(lengths)

        tokens = Token_Stack(tokens, self.token_dict, max_length)
        notes = Note_Stack(notes, max_length)
        durations = Duration_Stack(durations, max_length)

        tokens = torch.LongTensor(tokens)   # [Batch, Time]
        notes = torch.LongTensor(notes)   # [Batch, Time]
        durations = torch.LongTensor(durations)   # [Batch, Time]
        lengths = torch.LongTensor(lengths)   # [Batch]
        singers = torch.LongTensor(singers)   # [Batch]
        genres = torch.LongTensor(genres)   # [Batch]

        lyrics = [''.join([(x if x != '<X>' else ' ') for x in lyric]) for lyric in lyrics]

        return tokens, notes, durations, lengths, singers, genres, singer_labels, lyrics
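The *_Stack helpers pad ragged per-pattern sequences into rectangular arrays for batching. The sketch below shows that behavior with a toy token dictionary; the real dictionary comes from YAML/Token.yaml, so only the '<X>' padding entry here mirrors the repository.

from Datasets import Token_Stack, Note_Stack

# Toy token dictionary; only the '<X>' padding id matters for Token_Stack.
token_dict = {'<X>': 0, 'ㄱ': 1, 'ㅏ': 2, 'ㄴ': 3}

tokens = [[1, 2, 3], [1, 2]]        # ragged token id lists
notes = [[60, 62, 64], [65, 67]]    # ragged note lists

print(Token_Stack(tokens, token_dict))   # [[1 2 3] [1 2 0]], padded with token_dict['<X>']
print(Note_Stack(notes))                 # [[60 62 64] [65 67 0]], padded with 0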
Hyper_Parameters.yaml
ADDED
@@ -0,0 +1,45 @@
Sound:
    N_FFT: 2048
    Mel_Dim: 80
    Frame_Length: 1024
    Frame_Shift: 256
    Sample_Rate: 22050
    Mel_F_Min: 0
    Mel_F_Max: 8000

Feature_Type: 'Mel' #'Spectrogram', 'Mel'

Tokens: 77
Notes: 128
Durations: 5000
Genres: 1
Singers: 1
Duration:
    Equality: false
    Consonant_Duration: 3   # This is only used when Equality is False.

Encoder:
    Size: 384
    ConvFFT:
        Stack: 6
        Head: 2
        Dropout_Rate: 0.1
        Conv:
            Stack: 2
            Kernel_Size: 5
        FFN:
            Kernel_Size: 17

Diffusion:
    Max_Step: 100
    Size: 256
    Kernel_Size: 5
    Stack: 20

Token_Path: './YAML/Token.yaml'
Spectrogram_Range_Info_Path: './YAML/Spectrogram_Range_Info.yaml'
Mel_Range_Info_Path: './YAML/Mel_Range_Info.yaml'
Log_F0_Info_Path: './YAML/Log_F0_Info.yaml'
Log_Energy_Info_Path: './YAML/Log_Energy_Info.yaml'
Singer_Info_Path: './YAML/Singer_Info.yaml'
Genre_Info_Path: './YAML/Genre_Info.yaml'
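This file is consumed through Recursive_Parse, as Inference.py does, so nested keys become attribute access. A minimal loading sketch:

import yaml
from Arg_Parser import Recursive_Parse

hp = Recursive_Parse(yaml.load(
    open('Hyper_Parameters.yaml', encoding='utf-8'),
    Loader=yaml.Loader
    ))

print(hp.Feature_Type)             # 'Mel'
print(hp.Sound.Frame_Shift)        # 256 samples per frame at 22050 Hz
print(hp.Encoder.ConvFFT.Stack)    # 6 FFT blocks in the encoder
print(hp.Diffusion.Max_Step)       # 100 diffusion timesteps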
Inference.py
ADDED
@@ -0,0 +1,168 @@
import torch
import numpy as np
import logging, yaml, os, sys, argparse, math
import matplotlib.pyplot as plt
from tqdm import tqdm
from librosa import griffinlim

from Modules.Modules import DiffSinger
from Datasets import Inference_Dataset as Dataset, Inference_Collater as Collater
from meldataset import spectral_de_normalize_torch
from Arg_Parser import Recursive_Parse

import matplotlib as mpl
# Fix broken unicode rendering of the minus sign
mpl.rcParams['axes.unicode_minus'] = False
# Apply the NanumGothic font
plt.rcParams["font.family"] = 'NanumGothic'

logging.basicConfig(
    level=logging.INFO, stream=sys.stdout,
    format= '%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s'
    )

class Inferencer:
    def __init__(
        self,
        hp_path: str,
        checkpoint_path: str,
        batch_size= 1
        ):
        self.device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

        self.hp = Recursive_Parse(yaml.load(
            open(hp_path, encoding='utf-8'),
            Loader=yaml.Loader
            ))

        self.model = DiffSinger(self.hp).to(self.device)
        if self.hp.Feature_Type == 'Mel':
            self.vocoder = torch.jit.load('vocoder.pts', map_location='cpu').to(self.device)

        if self.hp.Feature_Type == 'Spectrogram':
            self.feature_range_info_dict = yaml.load(open(self.hp.Spectrogram_Range_Info_Path), Loader=yaml.Loader)
        if self.hp.Feature_Type == 'Mel':
            self.feature_range_info_dict = yaml.load(open(self.hp.Mel_Range_Info_Path), Loader=yaml.Loader)
        self.index_singer_dict = {
            value: key
            for key, value in yaml.load(open(self.hp.Singer_Info_Path), Loader=yaml.Loader).items()
            }

        if self.hp.Feature_Type == 'Spectrogram':
            self.feature_size = self.hp.Sound.N_FFT // 2 + 1
        elif self.hp.Feature_Type == 'Mel':
            self.feature_size = self.hp.Sound.Mel_Dim
        else:
            raise ValueError('Unknown feature type: {}'.format(self.hp.Feature_Type))

        self.Load_Checkpoint(checkpoint_path)
        self.batch_size = batch_size

    def Dataset_Generate(self, message_times_list, lyrics, notes, singers, genres):
        token_dict = yaml.load(open(self.hp.Token_Path), Loader=yaml.Loader)
        singer_info_dict = yaml.load(open(self.hp.Singer_Info_Path), Loader=yaml.Loader)
        genre_info_dict = yaml.load(open(self.hp.Genre_Info_Path), Loader=yaml.Loader)

        return torch.utils.data.DataLoader(
            dataset= Dataset(
                token_dict= token_dict,
                singer_info_dict= singer_info_dict,
                genre_info_dict= genre_info_dict,
                durations= message_times_list,
                lyrics= lyrics,
                notes= notes,
                singers= singers,
                genres= genres,
                sample_rate= self.hp.Sound.Sample_Rate,
                frame_shift= self.hp.Sound.Frame_Shift,
                equality_duration= self.hp.Duration.Equality,
                consonant_duration= self.hp.Duration.Consonant_Duration
                ),
            shuffle= False,
            collate_fn= Collater(
                token_dict= token_dict
                ),
            batch_size= self.batch_size,
            num_workers= 0,
            pin_memory= True
            )

    def Load_Checkpoint(self, path):
        state_dict = torch.load(path, map_location= 'cpu')
        self.model.load_state_dict(state_dict['Model']['DiffSVS'])
        self.steps = state_dict['Steps']

        self.model.eval()

        logging.info('Checkpoint loaded at {} steps.'.format(self.steps))

    @torch.inference_mode()
    def Inference_Step(self, tokens, notes, durations, lengths, singers, genres, singer_labels, ddim_steps):
        tokens = tokens.to(self.device, non_blocking=True)
        notes = notes.to(self.device, non_blocking=True)
        durations = durations.to(self.device, non_blocking=True)
        lengths = lengths.to(self.device, non_blocking=True)
        singers = singers.to(self.device, non_blocking=True)
        genres = genres.to(self.device, non_blocking=True)

        linear_predictions, diffusion_predictions, _, _ = self.model(
            tokens= tokens,
            notes= notes,
            durations= durations,
            lengths= lengths,
            genres= genres,
            singers= singers,
            ddim_steps= ddim_steps
            )
        linear_predictions = linear_predictions.clamp(-1.0, 1.0)
        diffusion_predictions = diffusion_predictions.clamp(-1.0, 1.0)

        linear_prediction_list, diffusion_prediction_list = [], []
        for linear_prediction, diffusion_prediction, singer in zip(linear_predictions, diffusion_predictions, singer_labels):
            feature_max = self.feature_range_info_dict[singer]['Max']
            feature_min = self.feature_range_info_dict[singer]['Min']
            linear_prediction_list.append((linear_prediction + 1.0) / 2.0 * (feature_max - feature_min) + feature_min)
            diffusion_prediction_list.append((diffusion_prediction + 1.0) / 2.0 * (feature_max - feature_min) + feature_min)
        linear_predictions = torch.stack(linear_prediction_list, dim= 0)
        diffusion_predictions = torch.stack(diffusion_prediction_list, dim= 0)

        if self.hp.Feature_Type == 'Mel':
            audios = self.vocoder(diffusion_predictions)
            if audios.ndim == 1:    # This is temporary because of the vocoder problem.
                audios = audios.unsqueeze(0)
            audios = [
                audio[:min(length * self.hp.Sound.Frame_Shift, audio.size(0))].cpu().numpy()
                for audio, length in zip(audios, lengths)
                ]
        elif self.hp.Feature_Type == 'Spectrogram':
            audios = []
            for prediction, length in zip(
                diffusion_predictions,
                lengths
                ):
                prediction = spectral_de_normalize_torch(prediction).cpu().numpy()
                audio = griffinlim(prediction)[:min(prediction.shape[1], length) * self.hp.Sound.Frame_Shift]
                audio = (audio / np.abs(audio).max() * 32767.5).astype(np.int16)
                audios.append(audio)

        return audios

    def Inference_Epoch(self, message_times_list, lyrics, notes, singers, genres, ddim_steps= None, use_tqdm= True):
        dataloader = self.Dataset_Generate(
            message_times_list= message_times_list,
            lyrics= lyrics,
            notes= notes,
            singers= singers,
            genres= genres
            )
        if use_tqdm:
            dataloader = tqdm(
                dataloader,
                desc='[Inference]',
                total= math.ceil(len(dataloader.dataset) / self.batch_size)
                )
        audios = []
        for tokens, notes, durations, lengths, singers, genres, singer_labels, lyrics in dataloader:
            audios.extend(self.Inference_Step(tokens, notes, durations, lengths, singers, genres, singer_labels, ddim_steps))

        return audios
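A usage sketch for the class above, assuming the checkpoint and hyper-parameter files added in this commit. The lyric, note, duration, singer, and genre values are placeholders: valid singer and genre keys come from YAML/Singer_Info.yaml and YAML/Genre_Info.yaml, durations are assumed to be per-syllable times in seconds, and writing the result with soundfile is an assumption about how the returned audio arrays would be consumed.

import soundfile as sf   # assumed helper for saving; not part of this commit's shown files
from Inference import Inferencer

inferencer = Inferencer(
    hp_path= 'Hyper_Parameters.yaml',
    checkpoint_path= 'Checkpoint/S_200000.pt'
    )

audios = inferencer.Inference_Epoch(
    message_times_list= [[0.5, 0.5, 1.0]],   # assumed per-syllable durations in seconds
    lyrics= [['가', '나', '다']],             # placeholder Korean syllables
    notes= [[64, 66, 68]],                   # placeholder MIDI note numbers
    singers= ['CSD'],                        # placeholder key; must exist in YAML/Singer_Info.yaml
    genres= ['Children'],                    # placeholder key; must exist in YAML/Genre_Info.yaml
    ddim_steps= 25
    )
sf.write('result.wav', audios[0], inferencer.hp.Sound.Sample_Rate)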
Modules/Diffusion.py
ADDED
@@ -0,0 +1,403 @@
import torch
import math
from argparse import Namespace
from typing import Optional, List, Dict, Union
from tqdm import tqdm

from .Layer import Conv1d, Lambda

class Diffusion(torch.nn.Module):
    def __init__(
        self,
        hyper_parameters: Namespace
        ):
        super().__init__()
        self.hp = hyper_parameters

        if self.hp.Feature_Type == 'Mel':
            self.feature_size = self.hp.Sound.Mel_Dim
        elif self.hp.Feature_Type == 'Spectrogram':
            self.feature_size = self.hp.Sound.N_FFT // 2 + 1

        self.denoiser = Denoiser(
            hyper_parameters= self.hp
            )

        self.timesteps = self.hp.Diffusion.Max_Step
        betas = torch.linspace(1e-4, 0.06, self.timesteps)
        alphas = 1.0 - betas
        alphas_cumprod = torch.cumprod(alphas, axis= 0)
        alphas_cumprod_prev = torch.cat([torch.tensor([1.0]), alphas_cumprod[:-1]])

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('alphas_cumprod', alphas_cumprod)   # [Diffusion_t]
        self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev)   # [Diffusion_t]
        self.register_buffer('sqrt_alphas_cumprod', alphas_cumprod.sqrt())
        self.register_buffer('sqrt_one_minus_alphas_cumprod', (1.0 - alphas_cumprod).sqrt())
        self.register_buffer('sqrt_recip_alphas_cumprod', (1.0 / alphas_cumprod).sqrt())
        self.register_buffer('sqrt_recipm1_alphas_cumprod', (1.0 / alphas_cumprod - 1.0).sqrt())

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        posterior_variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)

        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
        self.register_buffer('posterior_log_variance', torch.maximum(posterior_variance, torch.tensor([1e-20])).log())
        self.register_buffer('posterior_mean_coef1', betas * alphas_cumprod_prev.sqrt() / (1.0 - alphas_cumprod))
        self.register_buffer('posterior_mean_coef2', (1.0 - alphas_cumprod_prev) * alphas.sqrt() / (1.0 - alphas_cumprod))

    def forward(
        self,
        encodings: torch.Tensor,
        features: torch.Tensor= None
        ):
        '''
        encodings: [Batch, Enc_d, Enc_t]
        features: [Batch, Feature_d, Feature_t]
        '''
        if not features is None:    # train
            diffusion_steps = torch.randint(
                low= 0,
                high= self.timesteps,
                size= (encodings.size(0),),
                dtype= torch.long,
                device= encodings.device
                )    # random single step

            noises, epsilons = self.Get_Noise_Epsilon_for_Train(
                features= features,
                encodings= encodings,
                diffusion_steps= diffusion_steps,
                )
            return None, noises, epsilons
        else:    # inference
            features = self.Sampling(
                encodings= encodings,
                )
            return features, None, None

    def Sampling(
        self,
        encodings: torch.Tensor,
        ):
        features = torch.randn(
            size= (encodings.size(0), self.feature_size, encodings.size(2)),
            device= encodings.device
            )
        for diffusion_step in reversed(range(self.timesteps)):
            features = self.P_Sampling(
                features= features,
                encodings= encodings,
                diffusion_steps= torch.full(
                    size= (encodings.size(0), ),
                    fill_value= diffusion_step,
                    dtype= torch.long,
                    device= encodings.device
                    ),
                )

        return features

    def P_Sampling(
        self,
        features: torch.Tensor,
        encodings: torch.Tensor,
        diffusion_steps: torch.Tensor,
        ):
        posterior_means, posterior_log_variances = self.Get_Posterior(
            features= features,
            encodings= encodings,
            diffusion_steps= diffusion_steps,
            )

        noises = torch.randn_like(features) # [Batch, Feature_d, Feature_t]
        masks = (diffusion_steps > 0).float().unsqueeze(1).unsqueeze(1)  # [Batch, 1, 1], no noise added at step 0

        return posterior_means + masks * (0.5 * posterior_log_variances).exp() * noises

    def Get_Posterior(
        self,
        features: torch.Tensor,
        encodings: torch.Tensor,
        diffusion_steps: torch.Tensor
        ):
        noised_predictions = self.denoiser(
            features= features,
            encodings= encodings,
            diffusion_steps= diffusion_steps
            )

        # 'epsilons' here is the reconstructed x_0 estimate derived from the predicted noise.
        epsilons = \
            features * self.sqrt_recip_alphas_cumprod[diffusion_steps][:, None, None] - \
            noised_predictions * self.sqrt_recipm1_alphas_cumprod[diffusion_steps][:, None, None]
        epsilons.clamp_(-1.0, 1.0)  # clipped

        posterior_means = \
            epsilons * self.posterior_mean_coef1[diffusion_steps][:, None, None] + \
            features * self.posterior_mean_coef2[diffusion_steps][:, None, None]
        posterior_log_variances = \
            self.posterior_log_variance[diffusion_steps][:, None, None]

        return posterior_means, posterior_log_variances

    def Get_Noise_Epsilon_for_Train(
        self,
        features: torch.Tensor,
        encodings: torch.Tensor,
        diffusion_steps: torch.Tensor,
        ):
        noises = torch.randn_like(features)

        noised_features = \
            features * self.sqrt_alphas_cumprod[diffusion_steps][:, None, None] + \
            noises * self.sqrt_one_minus_alphas_cumprod[diffusion_steps][:, None, None]

        epsilons = self.denoiser(
            features= noised_features,
            encodings= encodings,
            diffusion_steps= diffusion_steps
            )

        return noises, epsilons

    def DDIM(
        self,
        encodings: torch.Tensor,
        ddim_steps: int,
        eta: float= 0.0,
        temperature: float= 1.0,
        use_tqdm: bool= False
        ):
        ddim_timesteps = self.Get_DDIM_Steps(
            ddim_steps= ddim_steps
            )
        sigmas, alphas, alphas_prev = self.Get_DDIM_Sampling_Parameters(
            ddim_timesteps= ddim_timesteps,
            eta= eta
            )
        sqrt_one_minus_alphas = (1. - alphas).sqrt()

        features = torch.randn(
            size= (encodings.size(0), self.feature_size, encodings.size(2)),
            device= encodings.device
            )

        step_range = reversed(range(ddim_steps))
        if use_tqdm:
            step_range = tqdm(
                step_range,
                desc= '[Diffusion]',
                total= ddim_steps
                )

        for diffusion_steps in step_range:
            noised_predictions = self.denoiser(
                features= features,
                encodings= encodings,
                diffusion_steps= torch.full(
                    size= (encodings.size(0), ),
                    fill_value= diffusion_steps,
                    dtype= torch.long,
                    device= encodings.device
                    )
                )

            feature_starts = (features - sqrt_one_minus_alphas[diffusion_steps] * noised_predictions) / alphas[diffusion_steps].sqrt()
            direction_pointings = (1.0 - alphas_prev[diffusion_steps] - sigmas[diffusion_steps].pow(2.0)).sqrt() * noised_predictions  # sqrt per the standard DDIM update
            noises = sigmas[diffusion_steps] * torch.randn_like(features) * temperature

            features = alphas_prev[diffusion_steps].sqrt() * feature_starts + direction_pointings + noises

        return features

    # https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py
    def Get_DDIM_Steps(
        self,
        ddim_steps: int,
        ddim_discr_method: str= 'uniform'
        ):
        if ddim_discr_method == 'uniform':
            ddim_timesteps = torch.arange(0, self.timesteps, self.timesteps // ddim_steps).long()
        elif ddim_discr_method == 'quad':
            ddim_timesteps = torch.linspace(0, math.sqrt(self.timesteps * 0.8), ddim_steps).pow(2.0).long()
        else:
            raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

        ddim_timesteps[-1] = self.timesteps - 1

        return ddim_timesteps

    def Get_DDIM_Sampling_Parameters(self, ddim_timesteps, eta):
        alphas = self.alphas_cumprod[ddim_timesteps]
        alphas_prev = self.alphas_cumprod_prev[ddim_timesteps]
        sigmas = eta * ((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)).sqrt()

        return sigmas, alphas, alphas_prev

class Denoiser(torch.nn.Module):
    def __init__(
        self,
        hyper_parameters: Namespace
        ):
        super().__init__()
        self.hp = hyper_parameters

        if self.hp.Feature_Type == 'Mel':
            feature_size = self.hp.Sound.Mel_Dim
        elif self.hp.Feature_Type == 'Spectrogram':
            feature_size = self.hp.Sound.N_FFT // 2 + 1

        self.prenet = torch.nn.Sequential(
            Conv1d(
                in_channels= feature_size,
                out_channels= self.hp.Diffusion.Size,
                kernel_size= 1,
                w_init_gain= 'relu'
                ),
            torch.nn.Mish()
            )

        self.step_ffn = torch.nn.Sequential(
            Diffusion_Embedding(
                channels= self.hp.Diffusion.Size
                ),
            Lambda(lambda x: x.unsqueeze(2)),
            Conv1d(
                in_channels= self.hp.Diffusion.Size,
                out_channels= self.hp.Diffusion.Size * 4,
                kernel_size= 1,
                w_init_gain= 'relu'
                ),
            torch.nn.Mish(),
            Conv1d(
                in_channels= self.hp.Diffusion.Size * 4,
                out_channels= self.hp.Diffusion.Size,
                kernel_size= 1,
                w_init_gain= 'linear'
                )
            )

        self.residual_blocks = torch.nn.ModuleList([
            Residual_Block(
                in_channels= self.hp.Diffusion.Size,
                kernel_size= self.hp.Diffusion.Kernel_Size,
                condition_channels= self.hp.Encoder.Size + feature_size
                )
            for _ in range(self.hp.Diffusion.Stack)
            ])

        self.projection = torch.nn.Sequential(
            Conv1d(
                in_channels= self.hp.Diffusion.Size,
                out_channels= self.hp.Diffusion.Size,
                kernel_size= 1,
                w_init_gain= 'relu'
                ),
            torch.nn.ReLU(),
            Conv1d(
                in_channels= self.hp.Diffusion.Size,
                out_channels= feature_size,
                kernel_size= 1
                ),
            )
        torch.nn.init.zeros_(self.projection[-1].weight)    # This zero initialization is a key factor.

    def forward(
        self,
        features: torch.Tensor,
        encodings: torch.Tensor,
        diffusion_steps: torch.Tensor
        ):
        '''
        features: [Batch, Feature_d, Feature_t]
        encodings: [Batch, Enc_d, Feature_t]
        diffusion_steps: [Batch]
        '''
        x = self.prenet(features)

        diffusion_steps = self.step_ffn(diffusion_steps) # [Batch, Res_d, 1]

        skips_list = []
        for residual_block in self.residual_blocks:
            x, skips = residual_block(
                x= x,
                conditions= encodings,
                diffusion_steps= diffusion_steps
                )
            skips_list.append(skips)

        x = torch.stack(skips_list, dim= 0).sum(dim= 0) / math.sqrt(self.hp.Diffusion.Stack)
        x = self.projection(x)

        return x

class Diffusion_Embedding(torch.nn.Module):
    def __init__(
        self,
        channels: int
        ):
        super().__init__()
        self.channels = channels

    def forward(self, x: torch.Tensor):
        half_channels = self.channels // 2  # sine and cosine
        embeddings = math.log(10000.0) / (half_channels - 1)
        embeddings = torch.exp(torch.arange(half_channels, device= x.device) * -embeddings)
        embeddings = x.unsqueeze(1) * embeddings.unsqueeze(0)
        embeddings = torch.cat([embeddings.sin(), embeddings.cos()], dim= -1)

        return embeddings

class Residual_Block(torch.nn.Module):
    def __init__(
        self,
        in_channels: int,
        kernel_size: int,
        condition_channels: int
        ):
        super().__init__()
        self.in_channels = in_channels

        self.condition = Conv1d(
            in_channels= condition_channels,
            out_channels= in_channels * 2,
            kernel_size= 1
            )
        self.diffusion_step = Conv1d(
            in_channels= in_channels,
            out_channels= in_channels,
            kernel_size= 1
            )

        self.conv = Conv1d(
            in_channels= in_channels,
            out_channels= in_channels * 2,
            kernel_size= kernel_size,
            padding= kernel_size // 2
            )

        self.projection = Conv1d(
            in_channels= in_channels,
            out_channels= in_channels * 2,
            kernel_size= 1
            )

    def forward(
        self,
        x: torch.Tensor,
        conditions: torch.Tensor,
        diffusion_steps: torch.Tensor
        ):
        residuals = x

        conditions = self.condition(conditions)
        diffusion_steps = self.diffusion_step(diffusion_steps)

        x = self.conv(x + diffusion_steps) + conditions
        x_a, x_b = x.chunk(chunks= 2, dim= 1)
        x = x_a.sigmoid() * x_b.tanh()

        x = self.projection(x)
        x, skips = x.chunk(chunks= 2, dim= 1)

        return (x + residuals) / math.sqrt(2.0), skips
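The training path above relies on the closed-form forward process q(x_t | x_0). Below is a small standalone reference sketch of that relation using the same linear beta schedule the module registers; it is an illustration under assumed toy shapes, not code from the repository.

import torch

timesteps = 100                                  # hp.Diffusion.Max_Step
betas = torch.linspace(1e-4, 0.06, timesteps)    # same schedule as Diffusion.__init__
alphas_cumprod = torch.cumprod(1.0 - betas, dim= 0)

x_0 = torch.randn(1, 80, 200)                    # a fake [Batch, Mel_Dim, Time] feature
t = torch.randint(0, timesteps, (1,))            # one random diffusion step per sample
noise = torch.randn_like(x_0)

# x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise,
# which is what Get_Noise_Epsilon_for_Train feeds to the denoiser.
x_t = alphas_cumprod[t].sqrt()[:, None, None] * x_0 \
    + (1.0 - alphas_cumprod[t]).sqrt()[:, None, None] * noise
print(x_t.shape)   # torch.Size([1, 80, 200])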
Modules/Layer.py
ADDED
@@ -0,0 +1,317 @@
import torch

class Conv1d(torch.nn.Conv1d):
    def __init__(self, w_init_gain= 'linear', *args, **kwargs):
        self.w_init_gain = w_init_gain
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        if self.w_init_gain in ['zero']:
            torch.nn.init.zeros_(self.weight)
        elif self.w_init_gain is None:
            pass
        elif self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_channels // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        elif self.w_init_gain == 'gate':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.xavier_uniform_(self.weight[:self.out_channels // 2], gain= torch.nn.init.calculate_gain('tanh'))
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

class ConvTranspose1d(torch.nn.ConvTranspose1d):
    def __init__(self, w_init_gain= 'linear', *args, **kwargs):
        self.w_init_gain = w_init_gain
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        if self.w_init_gain in ['zero']:
            torch.nn.init.zeros_(self.weight)
        elif self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_channels // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        elif self.w_init_gain == 'gate':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.xavier_uniform_(self.weight[:self.out_channels // 2], gain= torch.nn.init.calculate_gain('tanh'))
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

class Conv2d(torch.nn.Conv2d):
    def __init__(self, w_init_gain= 'linear', *args, **kwargs):
        self.w_init_gain = w_init_gain
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        if self.w_init_gain in ['zero']:
            torch.nn.init.zeros_(self.weight)
        elif self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_channels // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        elif self.w_init_gain == 'gate':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.xavier_uniform_(self.weight[:self.out_channels // 2], gain= torch.nn.init.calculate_gain('tanh'))
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

class ConvTranspose2d(torch.nn.ConvTranspose2d):
    def __init__(self, w_init_gain= 'linear', *args, **kwargs):
        self.w_init_gain = w_init_gain
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        if self.w_init_gain in ['zero']:
            torch.nn.init.zeros_(self.weight)
        elif self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_channels // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        elif self.w_init_gain == 'gate':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.xavier_uniform_(self.weight[:self.out_channels // 2], gain= torch.nn.init.calculate_gain('tanh'))
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

class Linear(torch.nn.Linear):
    def __init__(self, w_init_gain= 'linear', *args, **kwargs):
        self.w_init_gain = w_init_gain
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        if self.w_init_gain in ['zero']:
            torch.nn.init.zeros_(self.weight)
        elif self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            # torch.nn.Linear exposes out_features rather than out_channels.
            assert self.out_features % 2 == 0, 'The out_features of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_features // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_features // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

class Lambda(torch.nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        return self.lambd(x)

class Residual(torch.nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

class LayerNorm(torch.nn.Module):
    def __init__(self, num_features: int, eps: float= 1e-5):
        super().__init__()

        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(num_features))
        self.beta = torch.nn.Parameter(torch.zeros(num_features))

    def forward(self, inputs: torch.Tensor):
        means = inputs.mean(dim= 1, keepdim= True)
        variances = (inputs - means).pow(2.0).mean(dim= 1, keepdim= True)

        x = (inputs - means) * (variances + self.eps).rsqrt()

        shape = [1, -1] + [1] * (x.ndim - 2)

        return x * self.gamma.view(*shape) + self.beta.view(*shape)

class LightweightConv1d(torch.nn.Module):
    '''
    Args:
        input_size: # of channels of the input and output
        kernel_size: convolution channels
        padding: padding
        num_heads: number of heads used. The weight is of shape
            `(num_heads, 1, kernel_size)`
        weight_softmax: normalize the weight with softmax before the convolution

    Shape:
        Input: BxCxT, i.e. (batch_size, input_size, timesteps)
        Output: BxCxT, i.e. (batch_size, input_size, timesteps)

    Attributes:
        weight: the learnable weights of the module of shape
            `(num_heads, 1, kernel_size)`
        bias: the learnable bias of the module of shape `(input_size)`
    '''

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding=0,
        num_heads=1,
        weight_softmax=False,
        bias=False,
        weight_dropout=0.0,
        w_init_gain= 'linear'
        ):
        super().__init__()
        self.input_size = input_size
        self.kernel_size = kernel_size
        self.num_heads = num_heads
        self.padding = padding
        self.weight_softmax = weight_softmax
        self.weight = torch.nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))
        self.w_init_gain = w_init_gain

        if bias:
            self.bias = torch.nn.Parameter(torch.Tensor(input_size))
        else:
            self.bias = None
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
            )
        self.reset_parameters()

    def reset_parameters(self):
        if self.w_init_gain in ['relu', 'leaky_relu']:
            torch.nn.init.kaiming_uniform_(self.weight, nonlinearity= self.w_init_gain)
        elif self.w_init_gain == 'glu':
            assert self.out_channels % 2 == 0, 'The out_channels of GLU requires even number.'
            torch.nn.init.kaiming_uniform_(self.weight[:self.out_channels // 2], nonlinearity= 'linear')
            torch.nn.init.xavier_uniform_(self.weight[self.out_channels // 2:], gain= torch.nn.init.calculate_gain('sigmoid'))
        else:
            torch.nn.init.xavier_uniform_(self.weight, gain= torch.nn.init.calculate_gain(self.w_init_gain))
        if not self.bias is None:
            torch.nn.init.zeros_(self.bias)

    def forward(self, input):
        """
        input size: B x C x T
        output size: B x C x T
        """
        B, C, T = input.size()
        H = self.num_heads

        weight = self.weight
        if self.weight_softmax:
            weight = weight.softmax(dim=-1)

        weight = self.weight_dropout_module(weight)
        # Merge every C/H entries into the batch dimension (C = self.input_size)
        # B x C x T -> (B * C/H) x H x T
        # One can also expand the weight to C x 1 x K by a factor of C/H
        # and do not reshape the input instead, which is slow though
        input = input.view(-1, H, T)
        output = torch.nn.functional.conv1d(input, weight, padding=self.padding, groups=self.num_heads)
        output = output.view(B, C, T)
        if self.bias is not None:
            output = output + self.bias.view(1, -1, 1)

        return output

class FairseqDropout(torch.nn.Module):
    def __init__(self, p, module_name=None):
        super().__init__()
        self.p = p
        self.module_name = module_name
        self.apply_during_inference = False

    def forward(self, x, inplace: bool = False):
        if self.training or self.apply_during_inference:
            return torch.nn.functional.dropout(x, p=self.p, training=True, inplace=inplace)
        else:
            return x

class LinearAttention(torch.nn.Module):
    def __init__(
        self,
        channels: int,
        calc_channels: int,
        num_heads: int,
        dropout_rate: float= 0.1,
        use_scale: bool= True,
        use_residual: bool= True,
        use_norm: bool= True
        ):
        super().__init__()
        assert calc_channels % num_heads == 0
        self.calc_channels = calc_channels
        self.num_heads = num_heads
        self.use_scale = use_scale
        self.use_residual = use_residual
        self.use_norm = use_norm

        self.prenet = Conv1d(
            in_channels= channels,
            out_channels= calc_channels * 3,
            kernel_size= 1,
            bias=False,
            w_init_gain= 'linear'
            )
        self.projection = Conv1d(
            in_channels= calc_channels,
            out_channels= channels,
            kernel_size= 1,
            w_init_gain= 'linear'
            )
        self.dropout = torch.nn.Dropout(p= dropout_rate)

        if use_scale:
            self.scale = torch.nn.Parameter(torch.zeros(1))

        if use_norm:
            self.norm = LayerNorm(num_features= channels)

    def forward(self, x: torch.Tensor, *args, **kwargs):
        '''
        x: [Batch, Enc_d, Enc_t]
        '''
        residuals = x

        x = self.prenet(x)  # [Batch, Calc_d * 3, Enc_t]
        x = x.view(x.size(0), self.num_heads, x.size(1) // self.num_heads, x.size(2))   # [Batch, Head, Calc_d // Head * 3, Enc_t]
        queries, keys, values = x.chunk(chunks= 3, dim= 2)  # [Batch, Head, Calc_d // Head, Enc_t] * 3
        keys = (keys + 1e-5).softmax(dim= 3)

        contexts = keys @ values.permute(0, 1, 3, 2)    # [Batch, Head, Calc_d // Head, Calc_d // Head]
        contexts = contexts.permute(0, 1, 3, 2) @ queries   # [Batch, Head, Calc_d // Head, Enc_t]
        contexts = contexts.view(contexts.size(0), contexts.size(1) * contexts.size(2), contexts.size(3))  # [Batch, Calc_d, Enc_t]
        contexts = self.projection(contexts)    # [Batch, Enc_d, Enc_t]

        if self.use_scale:
            contexts = self.scale * contexts

        contexts = self.dropout(contexts)

        if self.use_residual:
            contexts = contexts + residuals

        if self.use_norm:
            contexts = self.norm(contexts)

        return contexts
Modules/Modules.py
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from argparse import Namespace
|
2 |
+
import torch
|
3 |
+
import math
|
4 |
+
from typing import Union
|
5 |
+
|
6 |
+
from .Layer import Conv1d, LayerNorm, LinearAttention
|
7 |
+
from .Diffusion import Diffusion
|
8 |
+
|
9 |
+
class DiffSinger(torch.nn.Module):
|
10 |
+
def __init__(self, hyper_parameters: Namespace):
|
11 |
+
super().__init__()
|
12 |
+
self.hp = hyper_parameters
|
13 |
+
|
14 |
+
self.encoder = Encoder(self.hp)
|
15 |
+
self.diffusion = Diffusion(self.hp)
|
16 |
+
|
17 |
+
def forward(
|
18 |
+
self,
|
19 |
+
tokens: torch.LongTensor,
|
20 |
+
notes: torch.LongTensor,
|
21 |
+
durations: torch.LongTensor,
|
22 |
+
lengths: torch.LongTensor,
|
23 |
+
genres: torch.LongTensor,
|
24 |
+
singers: torch.LongTensor,
|
25 |
+
features: Union[torch.FloatTensor, None]= None,
|
26 |
+
ddim_steps: Union[int, None]= None
|
27 |
+
):
|
28 |
+
encodings, linear_predictions = self.encoder(
|
29 |
+
tokens= tokens,
|
30 |
+
notes= notes,
|
31 |
+
durations= durations,
|
32 |
+
lengths= lengths,
|
33 |
+
genres= genres,
|
34 |
+
singers= singers
|
35 |
+
) # [Batch, Enc_d, Feature_t]
|
36 |
+
|
37 |
+
encodings = torch.cat([encodings, linear_predictions], dim= 1) # [Batch, Enc_d + Feature_d, Feature_t]
|
38 |
+
|
39 |
+
if not features is None or ddim_steps is None or ddim_steps == self.hp.Diffusion.Max_Step:
|
40 |
+
diffusion_predictions, noises, epsilons = self.diffusion(
|
41 |
+
encodings= encodings,
|
42 |
+
features= features,
|
43 |
+
)
|
44 |
+
else:
|
45 |
+
noises, epsilons = None, None
|
46 |
+
diffusion_predictions = self.diffusion.DDIM(
|
47 |
+
encodings= encodings,
|
48 |
+
ddim_steps= ddim_steps
|
49 |
+
)
|
50 |
+
|
51 |
+
return linear_predictions, diffusion_predictions, noises, epsilons
|
52 |
+
|
53 |
+
|
54 |
+
class Encoder(torch.nn.Module):
|
55 |
+
def __init__(
|
56 |
+
self,
|
57 |
+
hyper_parameters: Namespace
|
58 |
+
):
|
59 |
+
super().__init__()
|
60 |
+
self.hp = hyper_parameters
|
61 |
+
|
62 |
+
if self.hp.Feature_Type == 'Mel':
|
63 |
+
            self.feature_size = self.hp.Sound.Mel_Dim
        elif self.hp.Feature_Type == 'Spectrogram':
            self.feature_size = self.hp.Sound.N_FFT // 2 + 1

        self.token_embedding = torch.nn.Embedding(
            num_embeddings= self.hp.Tokens,
            embedding_dim= self.hp.Encoder.Size
            )
        self.note_embedding = torch.nn.Embedding(
            num_embeddings= self.hp.Notes,
            embedding_dim= self.hp.Encoder.Size
            )
        self.duration_embedding = Duration_Positional_Encoding(
            num_embeddings= self.hp.Durations,
            embedding_dim= self.hp.Encoder.Size
            )
        self.genre_embedding = torch.nn.Embedding(
            num_embeddings= self.hp.Genres,
            embedding_dim= self.hp.Encoder.Size,
            )
        self.singer_embedding = torch.nn.Embedding(
            num_embeddings= self.hp.Singers,
            embedding_dim= self.hp.Encoder.Size,
            )
        torch.nn.init.xavier_uniform_(self.token_embedding.weight)
        torch.nn.init.xavier_uniform_(self.note_embedding.weight)
        torch.nn.init.xavier_uniform_(self.genre_embedding.weight)
        torch.nn.init.xavier_uniform_(self.singer_embedding.weight)

        self.fft_blocks = torch.nn.ModuleList([
            FFT_Block(
                channels= self.hp.Encoder.Size,
                num_head= self.hp.Encoder.ConvFFT.Head,
                ffn_kernel_size= self.hp.Encoder.ConvFFT.FFN.Kernel_Size,
                dropout_rate= self.hp.Encoder.ConvFFT.Dropout_Rate
                )
            for _ in range(self.hp.Encoder.ConvFFT.Stack)
            ])

        self.linear_projection = Conv1d(
            in_channels= self.hp.Encoder.Size,
            out_channels= self.feature_size,
            kernel_size= 1,
            bias= True,
            w_init_gain= 'linear'
            )

    def forward(
        self,
        tokens: torch.Tensor,
        notes: torch.Tensor,
        durations: torch.Tensor,
        lengths: torch.Tensor,
        genres: torch.Tensor,
        singers: torch.Tensor
        ):
        x = \
            self.token_embedding(tokens) + \
            self.note_embedding(notes) + \
            self.duration_embedding(durations) + \
            self.genre_embedding(genres).unsqueeze(1) + \
            self.singer_embedding(singers).unsqueeze(1)
        x = x.permute(0, 2, 1)  # [Batch, Enc_d, Enc_t]

        for block in self.fft_blocks:
            x = block(x, lengths)  # [Batch, Enc_d, Enc_t]

        linear_predictions = self.linear_projection(x)  # [Batch, Feature_d, Enc_t]

        return x, linear_predictions

class FFT_Block(torch.nn.Module):
    def __init__(
        self,
        channels: int,
        num_head: int,
        ffn_kernel_size: int,
        dropout_rate: float= 0.1,
        ) -> None:
        super().__init__()

        self.attention = LinearAttention(
            channels= channels,
            calc_channels= channels,
            num_heads= num_head,
            dropout_rate= dropout_rate
            )

        self.ffn = FFN(
            channels= channels,
            kernel_size= ffn_kernel_size,
            dropout_rate= dropout_rate
            )

    def forward(
        self,
        x: torch.Tensor,
        lengths: torch.Tensor
        ) -> torch.Tensor:
        '''
        x: [Batch, Dim, Time]
        '''
        masks = (~Mask_Generate(lengths= lengths, max_length= torch.ones_like(x[0, 0]).sum())).unsqueeze(1).float()  # float mask

        # Attention + Dropout + LayerNorm
        x = self.attention(x)

        # FFN + Dropout + LayerNorm
        x = self.ffn(x, masks)

        return x * masks

class FFN(torch.nn.Module):
    def __init__(
        self,
        channels: int,
        kernel_size: int,
        dropout_rate: float= 0.1,
        ) -> None:
        super().__init__()
        self.conv_0 = Conv1d(
            in_channels= channels,
            out_channels= channels,
            kernel_size= kernel_size,
            padding= (kernel_size - 1) // 2,
            w_init_gain= 'relu'
            )
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p= dropout_rate)
        self.conv_1 = Conv1d(
            in_channels= channels,
            out_channels= channels,
            kernel_size= kernel_size,
            padding= (kernel_size - 1) // 2,
            w_init_gain= 'linear'
            )
        self.norm = LayerNorm(
            num_features= channels,
            )

    def forward(
        self,
        x: torch.Tensor,
        masks: torch.Tensor
        ) -> torch.Tensor:
        '''
        x: [Batch, Dim, Time]
        '''
        residuals = x

        x = self.conv_0(x * masks)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.conv_1(x * masks)
        x = self.dropout(x)
        x = self.norm(x + residuals)

        return x * masks

# https://pytorch.org/tutorials/beginner/transformer_tutorial.html
# https://github.com/soobinseo/Transformer-TTS/blob/master/network.py
class Duration_Positional_Encoding(torch.nn.Embedding):
    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        ):
        positional_embedding = torch.zeros(num_embeddings, embedding_dim)
        position = torch.arange(0, num_embeddings, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embedding_dim, 2).float() * (-math.log(10000.0) / embedding_dim))
        positional_embedding[:, 0::2] = torch.sin(position * div_term)
        positional_embedding[:, 1::2] = torch.cos(position * div_term)
        super().__init__(
            num_embeddings= num_embeddings,
            embedding_dim= embedding_dim,
            _weight= positional_embedding
            )
        self.weight.requires_grad = False

        self.alpha = torch.nn.Parameter(
            data= torch.ones(1) * 0.01,
            requires_grad= True
            )

    def forward(self, durations):
        '''
        durations: [Batch, Length]
        '''
        return self.alpha * super().forward(durations)  # [Batch, Dim, Length]

@torch.jit.script
def get_pe(x: torch.Tensor, pe: torch.Tensor):
    pe = pe.repeat(1, 1, math.ceil(x.size(2) / pe.size(2)))
    return pe[:, :, :x.size(2)]

def Mask_Generate(lengths: torch.Tensor, max_length: Union[torch.Tensor, int, None]= None):
    '''
    lengths: [Batch]
    max_lengths: an int value. If None, max_lengths == max(lengths)
    '''
    max_length = max_length or torch.max(lengths)
    sequence = torch.arange(max_length)[None, :].to(lengths.device)
    return sequence >= lengths[:, None]  # [Batch, Time]
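A minimal sketch of how the Mask_Generate helper above behaves, assuming it is imported from Modules/Modules.py; the example lengths are illustrative only:

import torch

lengths = torch.tensor([3, 5])   # two sequences of length 3 and 5
masks = Mask_Generate(lengths)   # [Batch, Time] boolean, Time == max(lengths)
# masks == [[False, False, False,  True,  True],
#           [False, False, False, False, False]]
# FFT_Block inverts and floats this mask, so frames beyond each sequence's length
# are zeroed after the attention and FFN stages.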
Pattern_Generator.py
ADDED
@@ -0,0 +1,64 @@
import numpy as np
import mido, os, pickle, yaml, argparse, math, librosa, hgtk, logging
from tqdm import tqdm
from pysptk.sptk import rapt
from typing import List, Tuple
from argparse import Namespace  # for type
import torch
from typing import Dict

from meldataset import mel_spectrogram, spectrogram, spec_energy
from Arg_Parser import Recursive_Parse

def Convert_Feature_Based_Music(
    music: List[Tuple[float, str, int]],
    sample_rate: int,
    frame_shift: int,
    consonant_duration: int= 3,
    equality_duration: bool= False
    ):
    previous_used = 0
    lyrics = []
    notes = []
    durations = []
    for message_time, lyric, note in music:
        duration = round(message_time * sample_rate) + previous_used
        previous_used = duration % frame_shift
        duration = duration // frame_shift

        if lyric == '<X>':
            lyrics.append(lyric)
            notes.append(note)
            durations.append(duration)
        else:
            lyrics.extend(Decompose(lyric))
            notes.extend([note] * 3)
            if equality_duration or duration < consonant_duration * 3:
                split_duration = [duration // 3] * 3
                split_duration[1] += duration % 3
                durations.extend(split_duration)
            else:
                durations.extend([
                    consonant_duration,  # onset
                    duration - consonant_duration * 2,  # nucleus
                    consonant_duration  # coda
                    ])

    return lyrics, notes, durations

def Expand_by_Duration(
    lyrics: List[str],
    notes: List[int],
    durations: List[int],
    ):
    lyrics = sum([[lyric] * duration for lyric, duration in zip(lyrics, durations)], [])
    notes = sum([*[[note] * duration for note, duration in zip(notes, durations)]], [])
    durations = [index for duration in durations for index in range(duration)]

    return lyrics, notes, durations

def Decompose(syllable: str):
    onset, nucleus, coda = hgtk.letter.decompose(syllable)
    coda += '_'

    return onset, nucleus, coda
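A small usage sketch of the two helpers above with a made-up three-note score; the sample_rate and frame_shift values are assumptions (the real ones come from Hyper_Parameters.yaml):

# (seconds, syllable, MIDI note); '<X>' marks a rest.
music = [(0.5, '떴', 76), (0.25, '<X>', 0), (0.5, '다', 74)]

lyrics, notes, durations = Convert_Feature_Based_Music(
    music= music,
    sample_rate= 22050,  # assumed
    frame_shift= 256,    # assumed hop size in samples
    )
# Each sung syllable is decomposed into onset/nucleus/coda and contributes three
# entries to lyrics/notes/durations; '<X>' stays as a single entry.

frame_lyrics, frame_notes, frame_positions = Expand_by_Duration(lyrics, notes, durations)
# Frame-level sequences: each symbol/note is repeated for its duration, and
# frame_positions counts 0..duration-1 within every note.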
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Diffsvs
+emoji: 🐢
+colorFrom: blue
+colorTo: blue
 sdk: streamlit
 sdk_version: 1.17.0
 app_file: app.py
YAML/Genre_Info.yaml
ADDED
@@ -0,0 +1 @@
Children: 0
YAML/Log_Energy_Info.yaml
ADDED
@@ -0,0 +1,3 @@
CSD:
  Mean: 3.540642499923706
  Std: 2.1372854709625244
YAML/Log_F0_Info.yaml
ADDED
@@ -0,0 +1,3 @@
CSD:
  Mean: 5.851496696472168
  Std: 0.2526451647281647
YAML/Mel_Range_Info.yaml
ADDED
@@ -0,0 +1,3 @@
CSD:
  Max: 2.6226840019226074
  Min: -11.512925148010254
YAML/Singer_Info.yaml
ADDED
@@ -0,0 +1 @@
CSD: 0
YAML/Spectrogram_Range_Info.yaml
ADDED
@@ -0,0 +1,3 @@
CSD:
  Max: 5.292316913604736
  Min: -10.36163330078125
YAML/Token.yaml
ADDED
@@ -0,0 +1,71 @@
<E>: 1
<S>: 0
<X>: 2
_: 3
"\u3131": 4
"\u3131_": 5
"\u3132": 6
"\u3132_": 7
"\u3133_": 8
"\u3134": 9
"\u3134_": 10
"\u3135_": 11
"\u3136_": 12
"\u3137": 13
"\u3137_": 14
"\u3138": 15
"\u3139": 16
"\u3139_": 17
"\u313A_": 18
"\u313B_": 19
"\u313C_": 20
"\u313D_": 21
"\u313E_": 22
"\u313F_": 23
"\u3140_": 24
"\u3141": 25
"\u3141_": 26
"\u3142": 27
"\u3142_": 28
"\u3143": 29
"\u3144_": 30
"\u3145": 31
"\u3145_": 32
"\u3146": 33
"\u3146_": 34
"\u3147": 35
"\u3147_": 36
"\u3148": 37
"\u3148_": 38
"\u3149": 39
"\u314A": 40
"\u314A_": 41
"\u314B": 42
"\u314B_": 43
"\u314C": 44
"\u314C_": 45
"\u314D": 46
"\u314D_": 47
"\u314E": 48
"\u314E_": 49
"\u314F": 50
"\u3150": 51
"\u3151": 52
"\u3152": 53
"\u3153": 54
"\u3154": 55
"\u3155": 56
"\u3156": 57
"\u3157": 58
"\u3158": 59
"\u3159": 60
"\u315A": 61
"\u315B": 62
"\u315C": 63
"\u315D": 64
"\u315E": 65
"\u315F": 66
"\u3160": 67
"\u3161": 68
"\u3162": 69
"\u3163": 70
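A short sketch of how this token table can be combined with the Decompose helper from Pattern_Generator.py; the path is the file added above, everything else is illustrative:

import yaml, hgtk

token_dict = yaml.safe_load(open('YAML/Token.yaml', encoding='utf-8'))  # jamo -> index

onset, nucleus, coda = hgtk.letter.decompose('날')  # ㄴ, ㅏ, ㄹ
coda += '_'                                         # codas are stored with a trailing underscore
token_ids = [token_dict[x] for x in (onset, nucleus, coda)]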
app.py
ADDED
@@ -0,0 +1,81 @@
import streamlit as st

from Inference import Inferencer

def app_diffsingerkr():
    if not 'diffsingerkr_duration' in st.session_state.keys():
        st.session_state.diffsingerkr_duration = ''
    if not 'diffsingerkr_lyric' in st.session_state.keys():
        st.session_state.diffsingerkr_lyric = ''
    if not 'diffsingerkr_note' in st.session_state.keys():
        st.session_state.diffsingerkr_note = ''
    if not 'inferencer' in st.session_state.keys():
        st.session_state.inferencer = Inferencer(
            hp_path= 'Hyper_Parameters.yaml',
            checkpoint_path= 'Checkpoint/S_200000.pt',
            batch_size= 1
            )

    st.title('DiffSinger-KR')
    st.markdown('* This code is an implementation of DiffSinger for Korean.')
    st.markdown('* When music score which is note, duration, and lyric information are entered, singing voices are synthesized accordingly.')
    st.markdown('* Due to the range of the trained dataset, the supported notes are between 65 and 89.')
    st.markdown('* Please refer to the [here](https://github.com/CODEJIN/DiffSingerKR) for the source code for training the model.')

    st.markdown('''---''')
    status_indicator = st.empty()
    status_indicator.header('Insert the music!')
    st.markdown('''---''')
    example1_col, example2_col, example3_col, _ = st.columns(4)
    if example1_col.button('Example 1'):
        st.session_state.diffsingerkr_duration = '0.52,0.17,0.35,0.35,0.35,0.35,0.70,0.35,0.35,0.70,0.35,0.35,0.70,0.52,0.17,0.35,0.35,0.35,0.35,0.70,0.35,0.35,0.35,0.35,1.39'
        st.session_state.diffsingerkr_lyric = '떴,다,떴,다,비,행,기,날,아,라,날,아,라,높,이,높,이,날,아,라,우,리,비,행,기'
        st.session_state.diffsingerkr_note = '76,74,72,74,76,76,76,74,74,74,76,79,79,76,74,72,74,76,76,76,74,74,76,74,72'
        st.experimental_rerun()
    if example2_col.button('Example 2'):
        st.session_state.diffsingerkr_duration = '0.53,0.52,0.50,0.57,0.58,0.46,0.48,0.50,0.37,0.13,0.43,0.21,0.57,0.43,0.49,1.44,0.26,0.49,0.14,0.13,0.57,0.26,0.06,0.15,0.63,0.26,0.51,0.20,0.48,0.72,0.22'
        st.session_state.diffsingerkr_lyric = '만,나,고,<X>,난,외,로,움,을,<X>,알,았,어,내,겐,<X>,관,심,조,<X>,차,<X>,없,<X>,다,는,걸,<X>,알,면,서'
        st.session_state.diffsingerkr_note = '76,78,79,0,71,74,72,71,72,0,71,69,69,71,74,0,79,78,79,0,71,0,74,0,74,72,72,0,71,71,69'
        st.experimental_rerun()
    if example3_col.button('Example 3'):
        st.session_state.diffsingerkr_duration = '0.33,0.16,0.33,0.49,0.33,0.16,0.81,0.33,0.16,0.16,0.33,0.16,0.49,0.16,0.82,0.33,0.16,0.33,0.49,0.33,0.16,0.33,0.49,0.33,0.33,0.16,0.33,1.47,0.33,0.16,0.33,0.49,0.33,0.16,0.81,0.33,0.16,0.16,0.33,0.16,0.49,0.16,0.82,0.33,0.16,0.33,0.16,0.33,0.49,0.16,0.33,0.33,0.33,0.33,0.16,0.33,0.82'
        st.session_state.diffsingerkr_lyric = '마,음,울,적,한,날,에,<X>,거,리,를,걸,어,보,고,향,기,로,운,칵,테,일,에,취,해,도,보,고,한,편,의,시,가,있,는,<X>,전,시,회,장,도,가,고,밤,새,도,<X>,록,그,리,움,에,편,질,쓰,고,파'
        st.session_state.diffsingerkr_note = '80,80,80,87,85,84,82,0,84,84,84,85,84,79,79,77,77,77,80,80,78,77,75,77,80,79,80,82,80,80,80,87,85,84,82,0,84,84,84,85,84,79,79,77,77,77,79,80,80,77,75,75,77,80,79,82,80'
        st.experimental_rerun()
    st.markdown('''---''')
    duration = st.text_input('Duration', value= st.session_state.diffsingerkr_duration)
    lyric = st.text_input('Lyric', value= st.session_state.diffsingerkr_lyric)
    note = st.text_input('Note', value= st.session_state.diffsingerkr_note)
    singer = 'CSD'
    genre = 'Children'
    key_adjustment = st.select_slider(
        label= 'Key adjustment',
        options= [x for x in range(-6, 7)],
        value= 0
        )

    if st.button("Generate!"):
        if duration != '' and lyric != '' and note != '':
            status_indicator.header('Generating...')
            audio = st.session_state.inferencer.Inference_Epoch(
                message_times_list= [[float(x) for x in duration.strip().split(',')]],
                lyrics= [[x for x in lyric.strip().split(',')]],
                notes= [[
                    (int(x) + key_adjustment if int(x) != 0 else int(x))
                    for x in note.strip().split(',')
                    ]],
                singers= [singer],
                genres= [genre]
                )[0]

            st.audio(
                audio,
                format="audio/wav",
                start_time=0,
                sample_rate= st.session_state.inferencer.hp.Sound.Sample_Rate
                )

            status_indicator.header('Done.')

if __name__ == '__main__':
    app_diffsingerkr()
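A condensed restatement, for illustration only, of the parsing that the Generate! button performs before calling Inference_Epoch; the shortened inputs below are the first three entries of Example 1:

duration = '0.52,0.17,0.35'
lyric = '떴,다,떴'
note = '76,74,72'

message_times = [float(x) for x in duration.strip().split(',')]  # [0.52, 0.17, 0.35]
lyrics = [x for x in lyric.strip().split(',')]                   # ['떴', '다', '떴']
notes = [int(x) for x in note.strip().split(',')]                # [76, 74, 72]; nonzero notes get key_adjustment added
# Inference_Epoch receives each of these wrapped in one more list (a batch of one),
# together with singers=['CSD'] and genres=['Children'].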
meldataset.py
ADDED
@@ -0,0 +1,230 @@
###############################################################################
# MIT License
#
# Copyright (c) 2020 Jungil Kong
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
###############################################################################

import math
import os
import random
import torch
import torch.utils.data
import numpy as np
from librosa.util import normalize
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def load_wav(full_path):
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)


def dynamic_range_decompression(x, C=1):
    return np.exp(x) / C


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    if fmax not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
        hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
    spec = spectral_normalize_torch(spec)

    return spec

def spectrogram(y, n_fft, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
    spec = spectral_normalize_torch(spec)

    return spec

def spec_energy(y, n_fft, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
    energy = torch.norm(spec, dim= 1)

    return energy

def get_dataset_filelist(a):
    with open(a.input_training_file, 'r', encoding='utf-8') as fi:
        training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                          for x in fi.read().split('\n') if len(x) > 0]

    with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
        validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav')
                            for x in fi.read().split('\n') if len(x) > 0]
    return training_files, validation_files


class MelDataset(torch.utils.data.Dataset):
    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            mel = np.load(
                os.path.join(self.base_mels_path, os.path.splitext(os.path.split(filename)[-1])[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        return len(self.audio_files)
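A minimal sketch of calling mel_spectrogram above on a dummy waveform. The STFT/mel settings are assumptions for illustration (they are not read from Hyper_Parameters.yaml here), and it presumes torch/librosa versions compatible with the positional librosa_mel_fn and torch.stft calls used in this file:

import torch

y = torch.zeros(1, 22050)  # one second of silence at an assumed 22.05 kHz sample rate
mel = mel_spectrogram(
    y, n_fft=1024, num_mels=80, sampling_rate=22050,
    hop_size=256, win_size=1024, fmin=0, fmax=8000,
    )
# mel: [1, 80, frames]; magnitudes are log-compressed by spectral_normalize_torch.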
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit
torch
librosa
mido
hgtk
pysptk
matplotlib
vocoder.pts
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b47a5d03d744861f94ee973294317f738ccc6dc6d27bafa5d8db5ed18f95566
size 55884400