oscillate vits duration
Browse files
- Modules/hifigan.py +29 -82
- Modules/vits/models.py +121 -862
- README.md +1 -1
- Utils/JDC/__init__.py +0 -1
- Utils/JDC/bst.pth +0 -3
- Utils/JDC/model.py +0 -190
- Utils/PLBERT/util.py +1 -1
- Utils/text_utils.py +101 -61
- api.py +6 -16
- audiobook.py +14 -27
- demo.py +15 -43
- live_demo.py +5 -7
- models.py +167 -251
- msinference.py +67 -139
- requirements.txt +1 -1
- tts.py +2 -2
Modules/hifigan.py
CHANGED
@@ -2,12 +2,12 @@ import torch
 import torch.nn.functional as F
 import torch.nn as nn
 from torch.nn import Conv1d, ConvTranspose1d
-from torch.nn.utils import weight_norm
+from torch.nn.utils.parametrizations import weight_norm
 import math
 import numpy as np
 
 
-
+
 
 
 def get_padding(kernel_size, dilation=1):
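
The only functional change in this hunk is the import: `torch.nn.utils.weight_norm` was deprecated (PyTorch 2.1) in favor of the parametrization-based `torch.nn.utils.parametrizations.weight_norm`, which is also why the old `remove_weight_norm`/`apply_weight_norm` helpers get deleted throughout this commit. A minimal sketch of the new API (module shapes here are illustrative):

    import torch.nn as nn
    from torch.nn.utils import parametrize
    from torch.nn.utils.parametrizations import weight_norm

    # weight is reparametrized as g * v / ||v|| and exposed via module.parametrizations
    conv = weight_norm(nn.Conv1d(16, 16, kernel_size=3, padding=1))

    # undoing it now goes through parametrize, not remove_weight_norm
    parametrize.remove_parametrizations(conv, "weight")
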
@@ -93,80 +93,38 @@ class AdaINResBlock1(torch.nn.Module):
         x = xt + x
         return x
 
-    def remove_weight_norm(self):
-        for l in self.convs1:
-            remove_weight_norm(l)
-        for l in self.convs2:
-            remove_weight_norm(l)
 
 
-
-
-
-
-
-
-        super(SineGen, self).__init__()
-        self.harmonic_num = harmonic_num
-        self.sampling_rate = samp_rate
-        self.voiced_threshold = voiced_threshold
-        self.upsample_scale = upsample_scale
-
-    def _f02sine(self, f0_values):
-        # --
-        # 134 HIFI
-        # torch.Size([1, 145200, 9])
-        # torch.Size([1, 145200, 9]) torch.Size([1, 145200, 9]) HIFi
+class SourceModuleHnNSF(torch.nn.Module):
+
+    def __init__(self):
+
+        super().__init__()
+        self.harmonic_num = 8
+        self.l_linear = torch.nn.Linear(self.harmonic_num + 1, 1)
+        self.upsample_scale = 300
+
+    def forward(self, x):
+        # --
+        x = torch.multiply(x, torch.FloatTensor(
+            [[range(1, self.harmonic_num + 2)]]).to(x.device))  # [1, 145200, 9]
+
         # modulo of negative f0_values => -21 % 10 = 9 as -3*10 + 9 = 21 NOTICE THAT f0_values IS SIGNED
-        rad_values =
-
-        rad_values =
+        rad_values = x / 25647  # ).clamp(0, 1)
+        # rad_values = torch.where(torch.logical_or(rad_values < 0, rad_values > 1), 0.5, rad_values)
+        rad_values = rad_values % 1  # % of neg values
+        rad_values = F.interpolate(rad_values.transpose(1, 2),
                                    scale_factor=1/self.upsample_scale,
-                                   mode=
+                                   mode='linear').transpose(1, 2)
 
         # 1.89 sounds also nice has woofer at punctuation
         phase = torch.cumsum(rad_values, dim=1) * 1.84 * np.pi
-        phase =
-
-
-
-        # print('____________________________________\nF0 F0\n', f0.abs().mean(), f0.mean(), f0.max(), f0.min()) # male voices sound less muffed via higher scaler in sine_waves
-        # f0 is already full length - [1, 142600, 1]
-
-        amplif = .0104 if f0.abs().mean() < 100 else .009  # vary amplif based on f0.abs().mean() - voice sensitive
-
-        fn = torch.multiply(f0, torch.FloatTensor(
-            [[range(1, self.harmonic_num + 2)]]).to(f0.device))  # [1, 145200, 9]
-
-        # .007 # very important effect DEFAULT=0.1 very sensitive to speaker - heuristically
-        sine_waves = self._f02sine(fn) * amplif  # .009
-
-        uv = (f0 > self.voiced_threshold).type(torch.float32)
-
-        return sine_waves * uv
-
-
-class SourceModuleHnNSF(torch.nn.Module):
-
-    def __init__(self, harmonic_num=8):
-
-        super(SourceModuleHnNSF, self).__init__()
-        self.l_sin_gen = SineGen()
-        # harmonic=8 is hard fixed due to this nn.Linear()
-        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = torch.nn.Tanh()
-
-    def forward(self, x):
-        # print(' HNnSF', x.shape) # why this is [1, 300, 1, 535800]
-        sine_wavs = self.l_sin_gen(x)
-        # This linear sums all 9 harmonics
-        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-        return sine_merge
+        phase = F.interpolate(phase.transpose(1, 2) * self.upsample_scale,
+                              scale_factor=self.upsample_scale, mode='linear').transpose(1, 2)
+        x = .009 * phase.sin()
+        # --
+        x = self.l_linear(x).tanh()
+        return x
 
 
 class Generator(torch.nn.Module):
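
This hunk folds the deleted SineGen/SourceModuleHnNSF pair into one module: F0 is multiplied by harmonics 1..9, turned into per-sample phase increments, integrated with a cumulative sum at a 300x-reduced rate, re-upsampled, and the nine sines are mixed down to one channel by the linear layer. The constants (the 25647 divisor, the 1.84*pi factor, the .009 amplitude) are the commit's own tuned values. A standalone sketch of the same cumulative-phase idea (function name and packaging are mine):

    import torch
    import torch.nn.functional as F

    def harmonic_source(f0, harmonic_num=8, upsample_scale=300):
        # f0: [batch, time, 1] fundamental frequency, already at waveform resolution
        harmonics = f0 * torch.arange(1, harmonic_num + 2, device=f0.device)  # [B, T, 9]
        rad = (harmonics / 25647) % 1                # phase increment per sample, in cycles
        rad = F.interpolate(rad.transpose(1, 2), scale_factor=1 / upsample_scale,
                            mode='linear').transpose(1, 2)
        phase = torch.cumsum(rad, dim=1) * 1.84 * torch.pi   # integrate at the low rate
        phase = F.interpolate(phase.transpose(1, 2) * upsample_scale,
                              scale_factor=upsample_scale, mode='linear').transpose(1, 2)
        return .009 * phase.sin()                    # [B, T, 9] harmonic sine bank
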
@@ -239,7 +197,7 @@ class Generator(torch.nn.Module):
             x_source = self.noise_res[i](x_source, s)
 
             x = self.ups[i](x)
-
+
             x = x + x_source
 
             xs = None
@@ -250,22 +208,12 @@ class Generator(torch.nn.Module):
                 else:
                     xs += self.resblocks[i*self.num_kernels+j](x, s)
             x = xs / self.num_kernels
-            x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)
+            # x = x + (1 / self.alphas[i+1]) * (torch.sin(self.alphas[i+1] * x) ** 2)  # noisy
         x = self.conv_post(x)
         x = torch.tanh(x)
 
         return x
 
-    def remove_weight_norm(self):
-        print('Removing weight norm...')
-        for l in self.ups:
-            remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-        remove_weight_norm(self.conv_pre)
-        remove_weight_norm(self.conv_post)
-
-
 class AdainResBlk1d(nn.Module):
 
     # also used in ProsodyPredictor()
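
The line disabled here as "noisy" is the Snake periodic activation, snake_a(x) = x + (1/a) * sin(a*x)**2, proposed in "Neural Networks Fail to Learn Periodic Functions and How to Fix It" (Ziyin et al., 2020) and common in BigVGAN-style vocoders as a periodic inductive bias; with it commented out, each upsampling stage now ends in the plain conv + tanh alone.
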
@@ -324,7 +272,7 @@ class UpSample1d(nn.Module):
         if self.layer_type == 'none':
             return x
         else:
-            return F.interpolate(x, scale_factor=2, mode='nearest')
+            return F.interpolate(x, scale_factor=2, mode='nearest-exact')
 
 
 class Decoder(nn.Module):
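
The 'nearest' -> 'nearest-exact' switch changes only the source-index rounding: 'nearest-exact' matches the rounding used by OpenCV and scikit-image, while legacy 'nearest' keeps PyTorch's historical off-by-half behaviour. At an exact 2x scale, as here, the two modes select the same samples; they diverge at fractional scales, e.g.:

    import torch
    import torch.nn.functional as F

    x = torch.tensor([[[1., 2., 3., 4.]]])  # [batch, channels, width]
    F.interpolate(x, scale_factor=1.5, mode='nearest')        # [[[1., 1., 2., 3., 3., 4.]]]
    F.interpolate(x, scale_factor=1.5, mode='nearest-exact')  # [[[1., 2., 2., 3., 4., 4.]]]
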
@@ -361,11 +309,10 @@ class Decoder(nn.Module):
 
     def forward(self, asr=None, F0_curve=None, N=None, s=None):
 
-
+
         F0 = self.F0_conv(F0_curve)
         N = self.N_conv(N)
 
-        # print(asr.shape, F0.shape, N.shape, 'TF')
 
         x = torch.cat([asr, F0, N], axis=1)
 
Modules/vits/models.py
CHANGED
@@ -1,15 +1,30 @@
 import math
 from dataclasses import dataclass
-from typing import Any, Optional, Tuple, Union
 import numpy as np
 import torch
-import torch.utils.checkpoint
 from torch import nn
-
-from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
 from transformers.modeling_outputs import BaseModelOutput, ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_utils import PretrainedConfig
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from transformers.tokenization_utils import PreTrainedTokenizer
+import phonemizer
+import uroman as ur
+import torch.nn.functional as F
+
+
+def has_non_roman_characters(input_string):
+    # Find any character outside the ASCII range
+    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+
+    # Search the input string for non-Roman characters
+    match = non_roman_pattern.search(input_string)
+    has_non_roman = match is not None
+    return has_non_roman
+
 
 class VitsConfig(PretrainedConfig):
 
@@ -74,11 +89,9 @@ class VitsConfig(PretrainedConfig):
         self.ffn_kernel_size = ffn_kernel_size
         self.flow_size = flow_size
         self.spectrogram_bins = spectrogram_bins
-
-
         self.initializer_range = initializer_range
         self.layer_norm_eps = layer_norm_eps
-        self.use_stochastic_duration_prediction = use_stochastic_duration_prediction
+        # self.use_stochastic_duration_prediction = use_stochastic_duration_prediction
         self.num_speakers = num_speakers
         self.speaker_embedding_size = speaker_embedding_size
         self.upsample_initial_channel = upsample_initial_channel

@@ -92,7 +105,6 @@ class VitsConfig(PretrainedConfig):
         self.duration_predictor_flow_bins = duration_predictor_flow_bins
         self.duration_predictor_tail_bound = duration_predictor_tail_bound
         self.duration_predictor_kernel_size = duration_predictor_kernel_size
-
         self.duration_predictor_num_flows = duration_predictor_num_flows
         self.duration_predictor_filter_channels = duration_predictor_filter_channels
         self.prior_encoder_num_flows = prior_encoder_num_flows

@@ -100,8 +112,6 @@ class VitsConfig(PretrainedConfig):
         self.posterior_encoder_num_wavenet_layers = posterior_encoder_num_wavenet_layers
         self.wavenet_kernel_size = wavenet_kernel_size
         self.wavenet_dilation_rate = wavenet_dilation_rate
-
-
         self.noise_scale = noise_scale
         self.noise_scale_duration = noise_scale_duration
         self.sampling_rate = sampling_rate
@@ -121,183 +131,9 @@ class VitsTextEncoderOutput(ModelOutput):
     last_hidden_state: torch.FloatTensor = None
     prior_means: torch.FloatTensor = None
     prior_log_variances: torch.FloatTensor = None
-    hidden_states:
-    attentions:
+    hidden_states: torch.FloatTensor = None
+    attentions: torch.FloatTensor = None
+
-
-def _unconstrained_rational_quadratic_spline(
-    inputs,
-    unnormalized_widths,
-    unnormalized_heights,
-    unnormalized_derivatives,
-    reverse=False,
-    tail_bound=5.0,
-    min_bin_width=1e-3,
-    min_bin_height=1e-3,
-    min_derivative=1e-3,
-):
-    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
-    outside_interval_mask = ~inside_interval_mask
-
-    outputs = torch.zeros_like(inputs)
-
-    constant = np.log(np.exp(1 - min_derivative) - 1)
-
-    unnormalized_derivatives = nn.functional.pad(unnormalized_derivatives, pad=(1, 1))
-    unnormalized_derivatives[..., 0] = constant
-    unnormalized_derivatives[..., -1] = constant
-
-    outputs[outside_interval_mask] = inputs[outside_interval_mask]
-
-    outputs[inside_interval_mask] = _rational_quadratic_spline(
-        inputs=inputs[inside_interval_mask],
-        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
-        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
-        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
-        reverse=reverse,
-        tail_bound=tail_bound,
-        min_bin_width=min_bin_width,
-        min_bin_height=min_bin_height,
-        min_derivative=min_derivative,
-    )
-    return outputs
-
-
-def _rational_quadratic_spline(
-    inputs,
-    unnormalized_widths,
-    unnormalized_heights,
-    unnormalized_derivatives,
-    reverse,
-    tail_bound,
-    min_bin_width,
-    min_bin_height,
-    min_derivative,
-):
-    """
-    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
-    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.
-
-    Args:
-        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
-            Second half of the hidden-states input to the Vits convolutional flow module.
-        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
-            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
-            layer in the convolutional flow module
-        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
-            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
-            layer in the convolutional flow module
-        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
-            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
-            layer in the convolutional flow module
-        reverse (`bool`):
-            Whether the model is being run in reverse mode.
-        tail_bound (`float`):
-            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
-            transform behaves as an identity function.
-        min_bin_width (`float`):
-            Minimum bin value across the width dimension for the piecewise rational quadratic function.
-        min_bin_height (`float`):
-            Minimum bin value across the height dimension for the piecewise rational quadratic function.
-        min_derivative (`float`):
-            Minimum bin value across the derivatives for the piecewise rational quadratic function.
-    Returns:
-        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
-            Hidden-states as transformed by the piecewise rational quadratic function.
-        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
-            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
-    """
-    upper_bound = tail_bound
-    lower_bound = -tail_bound
-
-    if torch.min(inputs) < lower_bound or torch.max(inputs) > upper_bound:
-        raise ValueError("Input to a transform is not within its domain")
-
-    num_bins = unnormalized_widths.shape[-1]
-
-    if min_bin_width * num_bins > 1.0:
-        raise ValueError(f"Minimal bin width {min_bin_width} too large for the number of bins {num_bins}")
-    if min_bin_height * num_bins > 1.0:
-        raise ValueError(f"Minimal bin height {min_bin_height} too large for the number of bins {num_bins}")
-
-    widths = nn.functional.softmax(unnormalized_widths, dim=-1)
-    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
-    cumwidths = torch.cumsum(widths, dim=-1)
-    cumwidths = nn.functional.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
-    cumwidths = (upper_bound - lower_bound) * cumwidths + lower_bound
-    cumwidths[..., 0] = lower_bound
-    cumwidths[..., -1] = upper_bound
-    widths = cumwidths[..., 1:] - cumwidths[..., :-1]
-
-    derivatives = min_derivative + nn.functional.softplus(unnormalized_derivatives)
-
-    heights = nn.functional.softmax(unnormalized_heights, dim=-1)
-    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
-    cumheights = torch.cumsum(heights, dim=-1)
-    cumheights = nn.functional.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
-    cumheights = (upper_bound - lower_bound) * cumheights + lower_bound
-    cumheights[..., 0] = lower_bound
-    cumheights[..., -1] = upper_bound
-    heights = cumheights[..., 1:] - cumheights[..., :-1]
-
-    bin_locations = cumheights if reverse else cumwidths
-    bin_locations[..., -1] += 1e-6
-    bin_idx = torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
-    bin_idx = bin_idx[..., None]
-
-    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
-    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
-
-    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
-    delta = heights / widths
-    input_delta = delta.gather(-1, bin_idx)[..., 0]
-
-    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
-    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
-
-    input_heights = heights.gather(-1, bin_idx)[..., 0]
-
-    intermediate1 = input_derivatives + input_derivatives_plus_one - 2 * input_delta
-    if not reverse:
-        raise ValueError
-        # theta = (inputs - input_cumwidths) / input_bin_widths
-        # theta_one_minus_theta = theta * (1 - theta)
-
-        # numerator = input_heights * (input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta)
-        # denominator = input_delta + intermediate1 * theta_one_minus_theta
-        # outputs = input_cumheights + numerator / denominator
-
-        # derivative_numerator = input_delta.pow(2) * (
-        #     input_derivatives_plus_one * theta.pow(2)
-        #     + 2 * input_delta * theta_one_minus_theta
-        #     + input_derivatives * (1 - theta).pow(2)
-        # )
-        # log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator)
-        # return outputs, log_abs_det
-    else:
-        # find the roots of a quadratic equation
-        intermediate2 = inputs - input_cumheights
-        intermediate3 = intermediate2 * intermediate1
-        a = input_heights * (input_delta - input_derivatives) + intermediate3
-        b = input_heights * input_derivatives - intermediate3
-        c = -input_delta * intermediate2
-
-        discriminant = b.pow(2) - 4 * a * c
-        if not (discriminant >= 0).all():
-            raise RuntimeError(f"invalid discriminant {discriminant}")
-
-        root = (2 * c) / (-b - torch.sqrt(discriminant))
-        outputs = root * input_bin_widths + input_cumwidths
-
-        # theta_one_minus_theta = root * (1 - root)
-        # denominator = input_delta + intermediate1 * theta_one_minus_theta
-        # derivative_numerator = input_delta.pow(2) * (
-        #     input_derivatives_plus_one * root.pow(2)
-        #     + 2 * input_delta * theta_one_minus_theta
-        #     + input_derivatives * (1 - root).pow(2)
-        # )
-        # log_abs_det = torch.log(derivative_numerator) - 2 * torch.log(denominator)
-        return outputs  #, -log_abs_det
 
 
 class VitsWaveNet(torch.nn.Module):
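
For reference on the deleted spline: inverting one rational-quadratic segment reduces to a quadratic in the normalized position theta in [0, 1]. With bin width w_k, height h_k, slope delta_k = h_k / w_k, knot derivatives d_k, d_{k+1}, left corner (x_k, y_k) and target y, the deleted code's coefficients are

    a = h_k (delta_k - d_k) + (y - y_k)(d_k + d_{k+1} - 2 delta_k)
    b = h_k d_k - (y - y_k)(d_k + d_{k+1} - 2 delta_k)
    c = -delta_k (y - y_k)

and it takes the numerically stable root form, which avoids cancellation between -b and the square root:

    theta = 2c / (-b - sqrt(b^2 - 4ac)),    x = x_k + theta * w_k
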
@@ -305,20 +141,14 @@ class VitsWaveNet(torch.nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_layers = num_layers
-
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
-
-
-
-
-        if config.speaker_embedding_size != 0:
-            cond_layer = torch.nn.Conv1d(config.speaker_embedding_size, 2 * config.hidden_size * num_layers, 1)
-            self.cond_layer = weight_norm(cond_layer, name="weight")
-
         for i in range(num_layers):
             dilation = config.wavenet_dilation_rate**i
             padding = (config.wavenet_kernel_size * dilation - dilation) // 2
@@ -337,53 +167,36 @@ class VitsWaveNet(torch.nn.Module):
                 res_skip_channels = 2 * config.hidden_size
             else:
                 res_skip_channels = config.hidden_size
-
             res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
             res_skip_layer = weight_norm(res_skip_layer, name="weight")
             self.res_skip_layers.append(res_skip_layer)
 
-    def forward(self,
         outputs = torch.zeros_like(inputs)
         num_channels = torch.IntTensor([self.hidden_size])[0]
-
-
         for i in range(self.num_layers):
             in_act = self.in_layers[i](inputs)
-
-
             # global_states = torch.zeros_like(hidden_states) # style ?
-
             # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
-
             # --
             # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
             # in_act = input_a # + input_b
             t_act = torch.tanh(in_act[:, :num_channels, :])
             s_act = torch.sigmoid(in_act[:, num_channels:, :])
             acts = t_act * s_act
-
-
-            #
-
-
             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.num_layers - 1:
                 res_acts = res_skip_acts[:, : self.hidden_size, :]
-                inputs =
                 outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
             else:
                 outputs = outputs + res_skip_acts
 
-        return outputs * padding_mask
 
-    def remove_weight_norm(self):
-        if self.speaker_embedding_size != 0:
-            torch.nn.utils.remove_weight_norm(self.cond_layer)
-        for layer in self.in_layers:
-            torch.nn.utils.remove_weight_norm(layer)
-        for layer in self.res_skip_layers:
-            torch.nn.utils.remove_weight_norm(layer)
 
 
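
VitsWaveNet keeps the classic WaveNet gated unit, with the global-conditioning add commented out: the dilated conv emits 2x hidden channels, split into a tanh "content" half and a sigmoid "gate" half. A minimal sketch of the fused activation this loop inlines:

    import torch

    def gated_activation(in_act, num_channels):
        # in_act: [batch, 2 * num_channels, time] output of the dilated conv
        t_act = torch.tanh(in_act[:, :num_channels, :])     # content
        s_act = torch.sigmoid(in_act[:, num_channels:, :])  # gate in (0, 1)
        return t_act * s_act
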
@@ -425,22 +238,6 @@ class HifiGanResidualBlock(nn.Module):
     def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
-    def apply_weight_norm(self):
-        weight_norm = nn.utils.weight_norm
-        if hasattr(nn.utils.parametrizations, "weight_norm"):
-            weight_norm = nn.utils.parametrizations.weight_norm
-
-        for layer in self.convs1:
-            weight_norm(layer)
-        for layer in self.convs2:
-            weight_norm(layer)
-
-    def remove_weight_norm(self):
-        for layer in self.convs1:
-            nn.utils.remove_weight_norm(layer)
-        for layer in self.convs2:
-            nn.utils.remove_weight_norm(layer)
-
     def forward(self, hidden_states):
         for conv1, conv2 in zip(self.convs1, self.convs2):
             residual = hidden_states
@@ -483,44 +280,18 @@ class VitsHifiGan(nn.Module):
             channels = config.upsample_initial_channel // (2 ** (i + 1))
             for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                 self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
-
         self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
 
-
-
-
-    def apply_weight_norm(self):
-        weight_norm = nn.utils.weight_norm
-        if hasattr(nn.utils.parametrizations, "weight_norm"):
-            weight_norm = nn.utils.parametrizations.weight_norm
-
-        for layer in self.upsampler:
-            weight_norm(layer)
-        for layer in self.resblocks:
-            layer.apply_weight_norm()
-
-    def remove_weight_norm(self):
-        for layer in self.upsampler:
-            nn.utils.remove_weight_norm(layer)
-        for layer in self.resblocks:
-            layer.remove_weight_norm()
-
-    def forward(
-        self,
-        spectrogram,
-        global_conditioning=None):
-
         hidden_states = self.conv_pre(spectrogram)
-
         for i in range(self.num_upsamples):
             hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
             hidden_states = self.upsampler[i](hidden_states)
-
             res_state = self.resblocks[i * self.num_kernels](hidden_states)
             for j in range(1, self.num_kernels):
                 res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
             hidden_states = res_state / self.num_kernels
-
         hidden_states = nn.functional.leaky_relu(hidden_states)
         hidden_states = self.conv_post(hidden_states)
         waveform = torch.tanh(hidden_states)
@@ -531,27 +302,20 @@ class VitsResidualCouplingLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.half_channels = config.flow_size // 2
-
         self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
         self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
         self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
 
-    def forward(self,
-
-
-
-
-
-            log_determinant = torch.sum(log_stddev, [1, 2])
-            return outputs, log_determinant
-        else:
-            second_half = (second_half - mean) * torch.exp(-log_stddev) * padding_mask
-            outputs = torch.cat([first_half, second_half], dim=1)
-            return outputs, None
 
 
 class VitsResidualCouplingBlock(nn.Module):
@@ -561,226 +325,20 @@ class VitsResidualCouplingBlock(nn.Module):
         for _ in range(config.prior_encoder_num_flows):
             self.flows.append(VitsResidualCouplingLayer(config))
 
-    def forward(self,
-
-
-
-
-        for flow in reversed(self.flows):
-            inputs = torch.flip(inputs, [1])
-            inputs, _ = flow(inputs, padding_mask, global_conditioning, reverse=True)
-        return inputs
-
-
-class VitsDilatedDepthSeparableConv(nn.Module):
-    def __init__(self, config, dropout_rate=0.0):
-        super().__init__()
-        kernel_size = config.duration_predictor_kernel_size
-        channels = config.hidden_size
-        self.num_layers = config.depth_separable_num_layers
-
-        self.convs_dilated = nn.ModuleList()
-        self.convs_pointwise = nn.ModuleList()
-        self.norms_1 = nn.ModuleList()
-        self.norms_2 = nn.ModuleList()
-        for i in range(self.num_layers):
-            dilation = kernel_size**i
-            padding = (kernel_size * dilation - dilation) // 2
-            self.convs_dilated.append(
-                nn.Conv1d(
-                    in_channels=channels,
-                    out_channels=channels,
-                    kernel_size=kernel_size,
-                    groups=channels,
-                    dilation=dilation,
-                    padding=padding,
-                )
-            )
-            self.convs_pointwise.append(nn.Conv1d(channels, channels, 1))
-            self.norms_1.append(nn.LayerNorm(channels))
-            self.norms_2.append(nn.LayerNorm(channels))
-
-    def forward(self, inputs, padding_mask, global_conditioning=None):
-        if global_conditioning is not None:
-            inputs = inputs + global_conditioning
-
-        for i in range(self.num_layers):
-            hidden_states = self.convs_dilated[i](inputs * padding_mask)
-            hidden_states = self.norms_1[i](hidden_states.transpose(1, -1)).transpose(1, -1)
-            hidden_states = nn.functional.gelu(hidden_states)
-            hidden_states = self.convs_pointwise[i](hidden_states)
-            hidden_states = self.norms_2[i](hidden_states.transpose(1, -1)).transpose(1, -1)
-            hidden_states = nn.functional.gelu(hidden_states)
-
-            inputs = inputs + hidden_states
-
-        return inputs * padding_mask
-
-
-class VitsConvFlow(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.filter_channels = config.hidden_size
-        self.half_channels = config.depth_separable_channels // 2
-        self.num_bins = config.duration_predictor_flow_bins
-        self.tail_bound = config.duration_predictor_tail_bound
-
-        self.conv_pre = nn.Conv1d(self.half_channels, self.filter_channels, 1)
-        self.conv_dds = VitsDilatedDepthSeparableConv(config)
-        self.conv_proj = nn.Conv1d(self.filter_channels, self.half_channels * (self.num_bins * 3 - 1), 1)
-
-    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
-        first_half, second_half = torch.split(inputs, [self.half_channels] * 2, dim=1)
-
-        hidden_states = self.conv_pre(first_half)
-        hidden_states = self.conv_dds(hidden_states, padding_mask, global_conditioning)
-        hidden_states = self.conv_proj(hidden_states) * padding_mask
-
-        batch_size, channels, length = first_half.shape
-        hidden_states = hidden_states.reshape(batch_size, channels, -1, length).permute(0, 1, 3, 2)
-
-        unnormalized_widths = hidden_states[..., : self.num_bins] / math.sqrt(self.filter_channels)
-        unnormalized_heights = hidden_states[..., self.num_bins : 2 * self.num_bins] / math.sqrt(self.filter_channels)
-        unnormalized_derivatives = hidden_states[..., 2 * self.num_bins :]
-
-        second_half = _unconstrained_rational_quadratic_spline(
-            second_half,
-            unnormalized_widths,
-            unnormalized_heights,
-            unnormalized_derivatives,
-            reverse=reverse,
-            tail_bound=self.tail_bound,
-        )
-
-        outputs = torch.cat([first_half, second_half], dim=1) * padding_mask
-
-        return outputs, None
-
-
-class VitsElementwiseAffine(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.channels = config.depth_separable_channels
-        self.translate = nn.Parameter(torch.zeros(self.channels, 1))
-        self.log_scale = nn.Parameter(torch.zeros(self.channels, 1))
 
-    def forward(self, inputs, padding_mask, global_conditioning=None, reverse=False):
-        if not reverse:
-            raise ValueError
-            # outputs = self.translate + torch.exp(self.log_scale) * inputs
-            # outputs = outputs * padding_mask
-            # log_determinant = torch.sum(self.log_scale * padding_mask, [1, 2])
-            # return outputs, log_determinant
-        else:
-            outputs = (inputs - self.translate) * torch.exp(-self.log_scale) * padding_mask
-            return outputs, None
-
-
-class VitsStochasticDurationPredictor(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        embed_dim = config.speaker_embedding_size
-        filter_channels = config.hidden_size
-
-        self.conv_pre = nn.Conv1d(filter_channels, filter_channels, 1)
-        self.conv_proj = nn.Conv1d(filter_channels, filter_channels, 1)
-        self.conv_dds = VitsDilatedDepthSeparableConv(config)
-
-        if embed_dim != 0:
-            self.cond = nn.Conv1d(embed_dim, filter_channels, 1)
-
-        self.flows = nn.ModuleList()
-        self.flows.append(VitsElementwiseAffine(config))
-        for _ in range(config.duration_predictor_num_flows):
-            self.flows.append(VitsConvFlow(config))
-
-        # self.post_conv_pre = nn.Conv1d(1, filter_channels, 1)
-        # self.post_conv_proj = nn.Conv1d(filter_channels, filter_channels, 1)
-        # self.post_conv_dds = VitsDilatedDepthSeparableConv(
-        #     config,
-        #     dropout_rate=config.duration_predictor_dropout,
-        # )
-
-        # self.post_flows = nn.ModuleList()
-        # self.post_flows.append(VitsElementwiseAffine(config))
-        # for _ in range(config.duration_predictor_num_flows):
-        #     self.post_flows.append(VitsConvFlow(config))
-
-    def forward(self, inputs, padding_mask, global_conditioning=None, durations=None, reverse=False, noise_scale=1.0):
-        inputs = torch.detach(inputs)
-        inputs = self.conv_pre(inputs)
-
-        if global_conditioning is not None:
-            raise ValueError
-            # global_conditioning = torch.detach(global_conditioning)
-            # inputs = inputs + self.cond(global_conditioning)
-
-        inputs = self.conv_dds(inputs, padding_mask)
-        inputs = self.conv_proj(inputs) * padding_mask
-
-        if not reverse:
-            raise ValueError
-            # hidden_states = self.post_conv_pre(durations)
-            # hidden_states = self.post_conv_dds(hidden_states, padding_mask)
-            # hidden_states = self.post_conv_proj(hidden_states) * padding_mask
-
-            # random_posterior = (
-            #     torch.randn(durations.size(0), 2, durations.size(2)).to(device=inputs.device, dtype=inputs.dtype)
-            #     * padding_mask
-            # )
-            # log_determinant_posterior_sum = 0
-            # latents_posterior = random_posterior
-            # for flow in self.post_flows:
-            #     latents_posterior, log_determinant = flow(
-            #         latents_posterior, padding_mask, global_conditioning=inputs + hidden_states
-            #     )
-            #     latents_posterior = torch.flip(latents_posterior, [1])
-            #     log_determinant_posterior_sum += log_determinant
-
-            # first_half, second_half = torch.split(latents_posterior, [1, 1], dim=1)
-
-            # log_determinant_posterior_sum += torch.sum(
-            #     (nn.functional.logsigmoid(first_half) + nn.functional.logsigmoid(-first_half)) * padding_mask, [1, 2]
-            # )
-            # logq = (
-            #     torch.sum(-0.5 * (math.log(2 * math.pi) + (random_posterior**2)) * padding_mask, [1, 2])
-            #     - log_determinant_posterior_sum
-            # )
-
-            # first_half = (durations - torch.sigmoid(first_half)) * padding_mask
-            # first_half = torch.log(torch.clamp_min(first_half, 1e-5)) * padding_mask
-            # log_determinant_sum = torch.sum(-first_half, [1, 2])
-
-            # latents = torch.cat([first_half, second_half], dim=1)
-            # for flow in self.flows:
-            #     latents, log_determinant = flow(latents, padding_mask, global_conditioning=inputs)
-            #     latents = torch.flip(latents, [1])
-            #     log_determinant_sum += log_determinant
-
-            # nll = torch.sum(0.5 * (math.log(2 * math.pi) + (latents**2)) * padding_mask, [1, 2]) - log_determinant_sum
-            # return nll + logq
-        else:
-            flows = list(reversed(self.flows))
-            flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
-
-            latents = (
-                torch.randn(inputs.size(0), 2, inputs.size(2)).to(device=inputs.device, dtype=inputs.dtype)
-                * noise_scale
-            )
-            for flow in flows:
-                latents = torch.flip(latents, [1])
-                latents, _ = flow(latents, padding_mask, global_conditioning=inputs, reverse=True)
-
-            log_duration, _ = torch.split(latents, [1, 1], dim=1)
-            return log_duration
 
 
 class VitsAttention(nn.Module):
-    """
 
     def __init__(self, config):
         super().__init__()
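
At inference the stochastic duration predictor deleted above runs only in reverse: sample a 2-channel noise tensor, push it backwards through the reversed flow stack (dropping the one flow that is unused in reverse), and read log-durations off the first channel. A sketch of just that control flow, assuming `flows` is the module list from the deleted class:

    import torch

    def sample_log_duration(flows, inputs, padding_mask, noise_scale=1.0):
        # inputs: projected text hidden states [batch, channels, text_len]
        flows = list(reversed(flows))
        flows = flows[:-2] + [flows[-1]]      # "remove a useless vflow"
        latents = torch.randn(inputs.size(0), 2, inputs.size(2),
                              device=inputs.device, dtype=inputs.dtype) * noise_scale
        for flow in flows:
            latents = torch.flip(latents, [1])   # swap channel roles between flows
            latents, _ = flow(latents, padding_mask, global_conditioning=inputs, reverse=True)
        log_duration, _ = torch.split(latents, [1, 1], dim=1)
        return log_duration                   # [batch, 1, text_len]
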
@@ -793,36 +351,22 @@ class VitsAttention(nn.Module):
         self.scaling = self.head_dim**-0.5
 
         if (self.head_dim * self.num_heads) != self.embed_dim:
-            raise ValueError
-                f"hidden_size must be divisible by num_attention_heads (got `hidden_size`: {self.embed_dim}"
-                f" and `num_attention_heads`: {self.num_heads})."
-            )
 
         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
 
-        if self.window_size:
-            # Those provide relative pos embs for k/v interpolated from 2*4+1 to 1027 time frames - duration of txt
-            self.emb_rel_k = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling)
-            self.emb_rel_v = nn.Parameter(torch.randn(1, self.window_size * 2 + 1, self.head_dim) * self.scaling)
-
     def _shape(self, tensor, seq_len, bsz):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
     def forward(
         self,
         hidden_states,
-
-
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
     ):
-
-
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
 
         bsz, tgt_len, _ = hidden_states.size()
@@ -840,36 +384,9 @@ class VitsAttention(nn.Module):
 
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-
-
-
-        if self.window_size is not None:
-            # 4
-            # key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)
-            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, src_len)  # try fix k.shape[2] to have consistent voice deu
-            # print(f'{self.emb_rel_k.shape=} {key_relative_embeddings.shape=}\n\nL855')
-            relative_logits = torch.matmul(query_states, key_relative_embeddings.transpose(-2, -1))
-            # -- only here (key)
-            rel_pos_bias = self._relative_position_to_absolute_position(relative_logits)
-            attn_weights += rel_pos_bias
-
-        if attention_mask is not None:
-
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
         attn_output = torch.bmm(attn_weights,
                                 value_states)
-
-
-
-        if self.window_size is not None:
-            # Entering here with self.window_size = 4
-            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, src_len)
-            relative_weights = self._absolute_position_to_relative_position(attn_weights)
-            rel_pos_bias = torch.matmul(relative_weights, value_relative_embeddings)
-            attn_output += rel_pos_bias
-
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
@@ -881,42 +398,6 @@ class VitsAttention(nn.Module):
 
         return attn_output, None  # attn_weights_reshaped
 
-    def _get_relative_embeddings(self, relative_embeddings, length):
-        pad_length = max(length - (self.window_size + 1), 0)
-        if pad_length > 0:
-            relative_embeddings = nn.functional.pad(relative_embeddings, [0, 0, pad_length, pad_length, 0, 0])
-
-        slice_start_position = max((self.window_size + 1) - length, 0)
-        slice_end_position = slice_start_position + 2 * length - 1
-        return relative_embeddings[:, slice_start_position:slice_end_position]
-
-    def _relative_position_to_absolute_position(self, x):
-        batch_heads, length, _ = x.size()
-
-        # Concat columns of pad to shift from relative to absolute indexing.
-        x = nn.functional.pad(x, [0, 1, 0, 0, 0, 0])
-
-        # Concat extra elements so to add up to shape (len+1, 2*len-1).
-        x_flat = x.view([batch_heads, length * 2 * length])
-        x_flat = nn.functional.pad(x_flat, [0, length - 1, 0, 0])
-
-        # Reshape and slice out the padded elements.
-        x_final = x_flat.view([batch_heads, length + 1, 2 * length - 1])
-        x_final = x_final[:, :length, length - 1 :]
-        return x_final
-
-    def _absolute_position_to_relative_position(self, x):
-        batch_heads, length, _ = x.size()
-
-        # Pad along column
-        x = nn.functional.pad(x, [0, length - 1, 0, 0, 0, 0])
-        x_flat = x.view([batch_heads, length * (2 * length - 1)])
-
-        # Add 0's in the beginning that will skew the elements after reshape
-        x_flat = nn.functional.pad(x_flat, [length, 0, 0, 0])
-        x_final = x_flat.view([batch_heads, length, 2 * length])[:, :, 1:]
-        return x_final
-
 
 class VitsFeedForward(nn.Module):
     def __init__(self, config):
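
The deleted `_relative_position_to_absolute_position` is the standard pad-and-reshape trick for windowed relative attention (as in Music Transformer): it converts [heads, L, 2L-1] relative logits into [heads, L, L] absolute ones with two pads and a view instead of a gather. The same trick in isolation:

    import torch
    import torch.nn.functional as F

    def rel_to_abs(x):
        # x: [batch_heads, length, 2 * length - 1] relative-position logits
        batch_heads, length, _ = x.size()
        x = F.pad(x, [0, 1])                                 # one extra column
        x_flat = x.view(batch_heads, length * 2 * length)
        x_flat = F.pad(x_flat, [0, length - 1])              # skew rows on reshape
        x_final = x_flat.view(batch_heads, length + 1, 2 * length - 1)
        return x_final[:, :length, length - 1:]              # [batch_heads, length, length]
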
@@ -933,25 +414,15 @@ class VitsFeedForward(nn.Module):
         else:
             self.padding = None
 
-    def forward(self, hidden_states
         hidden_states = hidden_states.permute(0, 2, 1)
-        padding_mask = padding_mask.permute(0, 2, 1)
-
-        hidden_states = hidden_states * padding_mask
         if self.padding is not None:
             hidden_states = nn.functional.pad(hidden_states, self.padding)
-
         hidden_states = self.conv_1(hidden_states)
         hidden_states = self.act_fn(hidden_states)
-
-
-        hidden_states = hidden_states * padding_mask
         if self.padding is not None:
             hidden_states = nn.functional.pad(hidden_states, self.padding)
-
         hidden_states = self.conv_2(hidden_states)
-        hidden_states = hidden_states * padding_mask
-
         hidden_states = hidden_states.permute(0, 2, 1)
         return hidden_states
@@ -960,22 +431,19 @@ class VitsEncoderLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.attention = VitsAttention(config)
-
         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.feed_forward = VitsFeedForward(config)
         self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(
         self,
-        hidden_states
-
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
     ):
         residual = hidden_states
         hidden_states, attn_weights = self.attention(
             hidden_states=hidden_states,
-            attention_mask=attention_mask,
             output_attentions=output_attentions,
         )
@@ -983,15 +451,12 @@ class VitsEncoderLayer(nn.Module):
         hidden_states = self.layer_norm(residual + hidden_states)
 
         residual = hidden_states
-        hidden_states = self.feed_forward(hidden_states
-
         hidden_states = self.final_layer_norm(residual + hidden_states)
 
         outputs = (hidden_states,)
 
-        if output_attentions:
-            outputs += (attn_weights,)
-
         return outputs
 
@@ -1005,52 +470,24 @@ class VitsEncoder(nn.Module):
 
     def forward(
         self,
-        hidden_states
-
-
-
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ):
-
-
-
-        # expand attention_mask
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
-
-        hidden_states = hidden_states * padding_mask
-
-
-
-        for encoder_layer in self.layers:
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = encoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                padding_mask=padding_mask,
-                output_attentions=output_attentions,
-            )
             hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_outputs[1],)
-
-        hidden_states = hidden_states * padding_mask
-
         return BaseModelOutput(
             last_hidden_state=hidden_states,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
         )
 
 
 class VitsTextEncoder(nn.Module):
     """
-
     """
 
     def __init__(self, config):
@@ -1060,75 +497,30 @@ class VitsTextEncoder(nn.Module):
         self.encoder = VitsEncoder(config)  # 6 Layers of VitsAttention
         self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
 
-
-
-
-
-
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        padding_mask: torch.FloatTensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = True,
-    ):
-        hidden_states = self.embed_tokens(input_ids) * math.sqrt(self.config.hidden_size)
-
-        encoder_outputs = self.encoder(
-            hidden_states=hidden_states,
-            padding_mask=padding_mask,
-            attention_mask=attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
 
-
-
-        stats = self.project(last_hidden_state.transpose(1, 2)).transpose(1, 2) * padding_mask
         prior_means, prior_log_variances = torch.split(stats, self.config.flow_size, dim=2)
 
         return VitsTextEncoderOutput(
             last_hidden_state=last_hidden_state,
             prior_means=prior_means,
-            prior_log_variances=prior_log_variances,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
         )
 
 
 class VitsPreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-
     config_class = VitsConfig
     base_model_prefix = "vits"
     main_input_name = "input_ids"
     supports_gradient_checkpointing = True
 
-    def _init_weights(self, module):
-        """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        elif isinstance(module, nn.Conv1d):
-            nn.init.kaiming_normal_(module.weight)
-            if module.bias is not None:
-                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
-                nn.init.uniform_(module.bias, a=-k, b=k)
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
 
 
 class VitsModel(VitsPreTrainedModel):
@@ -1138,27 +530,9 @@ class VitsModel(VitsPreTrainedModel):
         self.text_encoder = VitsTextEncoder(config)  # has VitsEncoder that includes 6L of VitsAttention
         self.flow = VitsResidualCouplingBlock(config)
         self.decoder = VitsHifiGan(config)
-
-        if config.use_stochastic_duration_prediction:
-            self.duration_predictor = VitsStochasticDurationPredictor(config)
-        else:
-            raise ValueError
-            # self.duration_predictor = VitsDurationPredictor(config)
-
-        if config.num_speakers > 1:
-            self.embed_speaker = nn.Embedding(config.num_speakers, config.speaker_embedding_size)
-
-
-
-        self.noise_scale = config.noise_scale
-        self.noise_scale_duration = config.noise_scale_duration
-
         # Initialize weights and apply final processing
         self.post_init()
 
-    def get_encoder(self):
-        return self.text_encoder
-
     def forward(
         self,
         input_ids = None,
@@ -1168,69 +542,37 @@ class VitsModel(VitsPreTrainedModel):
         output_hidden_states = None,
         return_dict = None,
         labels = None,
-        speed=None,
     ):
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if labels is not None:
-            raise NotImplementedError("Training of VITS is not supported yet.")
-
         mask_dtype = self.text_encoder.embed_tokens.weight.dtype
         if attention_mask is not None:
             input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
         else:
             input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
-
-
-        if not 0 <= speaker_id < self.config.num_speakers:
-            raise ValueError(f"Set `speaker_id` in the range 0-{self.config.num_speakers - 1}.")
-        if isinstance(speaker_id, int):
-            speaker_id = torch.full(size=(1,), fill_value=speaker_id, device=self.device)
-            speaker_embeddings = self.embed_speaker(speaker_id).unsqueeze(-1)
-        else:
-            speaker_embeddings = None
-
-        text_encoder_output = self.text_encoder(
-            input_ids=input_ids,
-            padding_mask=input_padding_mask,
-            attention_mask=attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = text_encoder_output[0] if not return_dict else text_encoder_output.last_hidden_state
-        hidden_states = hidden_states.transpose(1, 2)
         input_padding_mask = input_padding_mask.transpose(1, 2)
-        prior_means =
-
-
-        if
-
-
-
-
-        )
         else:
-
-
-
-
             predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
-
-        # Create a padding mask for the output lengths of shape (batch, 1, max_output_length)
         indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
         output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
         output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
-
         # Reconstruct an attention tensor of shape (batch, 1, out_length, in_length)
         attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
         batch_size, _, output_length, input_length = attn_mask.shape
         cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
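
This hunk and the next one turn per-token durations into a hard monotonic alignment: clamp the summed durations to get output lengths, build an output padding mask, then compare each output frame index against the cumulative durations so every frame selects exactly one token (the `valid_indices` / `padded_indices` differencing below). An equivalent single-sample sketch of the expansion, without the masked batch bookkeeping:

    import torch

    def expand_by_duration(token_states, duration):
        # token_states: [text_len, channels]; duration: [text_len] integer frame counts
        cum = torch.cumsum(duration, 0)              # end frame of each token
        frames = torch.arange(int(duration.sum()))
        # frame t belongs to the first token whose cumulative duration exceeds t
        token_idx = torch.searchsorted(cum, frames, right=True)
        return token_states[token_idx]               # [total_frames, channels]
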
@@ -1239,106 +581,30 @@ class VitsModel(VitsPreTrainedModel):
         valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
         padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
         attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
-
-        # Expand prior distribution
-        prior_means = torch.matmul(attn.squeeze(1), prior_means).transpose(1, 2)
-        prior_log_variances = torch.matmul(attn.squeeze(1), prior_log_variances).transpose(1, 2)
-
-        prior_latents = prior_means + torch.randn_like(prior_means) * torch.exp(prior_log_variances) * self.noise_scale
-        latents = self.flow(prior_latents, output_padding_mask, speaker_embeddings, reverse=True)
-
-        spectrogram = latents * output_padding_mask
-        waveform = self.decoder(spectrogram, speaker_embeddings)
-        waveform = waveform.squeeze(1)
-        sequence_lengths = predicted_lengths * np.prod(self.config.upsample_rates)
-
-        if not return_dict:
-            outputs = (waveform, sequence_lengths, spectrogram) + text_encoder_output[3:]
-            return outputs
-
-        return waveform
-
-
 
 
-        #
 
-
-# Copyright 2023 The Kakao Enterprise Authors, the MMS-TTS Authors and the HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for VITS."""
-
-import json
-import os
-import re
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.utils import is_phonemizer_available, is_uroman_available
-
-
-if is_phonemizer_available():
-    import phonemizer
-
-if is_uroman_available():
-    import uroman as ur
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
-
-
-def has_non_roman_characters(input_string):
-    # Find any character outside the ASCII range
-    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
-
-    # Search the input string for non-Roman characters
-    match = non_roman_pattern.search(input_string)
-    has_non_roman = match is not None
-    return has_non_roman
 
 
 class VitsTokenizer(PreTrainedTokenizer):
-    """
-    Construct a VITS tokenizer. Also supports MMS-TTS.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        language (`str`, *optional*):
-            Language identifier.
-        add_blank (`bool`, *optional*, defaults to `True`):
-            Whether to insert token id 0 in between the other tokens.
-        normalize (`bool`, *optional*, defaults to `True`):
-            Whether to normalize the input text by removing all casing and punctuation.
-        phonemize (`bool`, *optional*, defaults to `True`):
-            Whether to convert the input text into phonemes.
-        is_uroman (`bool`, *optional*, defaults to `False`):
-            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
@@ -1412,12 +678,8 @@ class VitsTokenizer(PreTrainedTokenizer):
         return text
 
     def prepare_for_tokenization(
-        self, text: str, is_split_into_words: bool = False, normalize
-
-        '''
-        Performs any necessary transformations before tokenization.
-
-        '''
         normalize = normalize if normalize is not None else self.normalize
 
         if normalize:

@@ -1462,21 +724,18 @@ class VitsTokenizer(PreTrainedTokenizer):
         tokens = list(text)
 
         if self.add_blank:
-
-
-
 
         return tokens
 
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        if self.add_blank and len(tokens) > 1:
-            tokens = tokens[1::2]
-        return "".join(tokens)
-
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
|
 import math
 from dataclasses import dataclass
 import numpy as np
 import torch
 from torch import nn
 from transformers.modeling_outputs import BaseModelOutput, ModelOutput
 from transformers.modeling_utils import PreTrainedModel
 from transformers.configuration_utils import PretrainedConfig
+import json
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple
+from transformers.tokenization_utils import PreTrainedTokenizer
+import phonemizer
+import uroman as ur
+import torch.nn.functional as F
+
+
+def has_non_roman_characters(input_string):
+    # Find any character outside the ASCII range
+    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+
+    # Search the input string for non-Roman characters
+    match = non_roman_pattern.search(input_string)
+    has_non_roman = match is not None
+    return has_non_roman
+
 
 class VitsConfig(PretrainedConfig):
 
         self.ffn_kernel_size = ffn_kernel_size
         self.flow_size = flow_size
         self.spectrogram_bins = spectrogram_bins
         self.initializer_range = initializer_range
         self.layer_norm_eps = layer_norm_eps
+        # self.use_stochastic_duration_prediction = use_stochastic_duration_prediction
         self.num_speakers = num_speakers
         self.speaker_embedding_size = speaker_embedding_size
         self.upsample_initial_channel = upsample_initial_channel
 
         self.duration_predictor_flow_bins = duration_predictor_flow_bins
         self.duration_predictor_tail_bound = duration_predictor_tail_bound
         self.duration_predictor_kernel_size = duration_predictor_kernel_size
         self.duration_predictor_num_flows = duration_predictor_num_flows
         self.duration_predictor_filter_channels = duration_predictor_filter_channels
         self.prior_encoder_num_flows = prior_encoder_num_flows
 
         self.posterior_encoder_num_wavenet_layers = posterior_encoder_num_wavenet_layers
         self.wavenet_kernel_size = wavenet_kernel_size
         self.wavenet_dilation_rate = wavenet_dilation_rate
         self.noise_scale = noise_scale
         self.noise_scale_duration = noise_scale_duration
         self.sampling_rate = sampling_rate
 
     last_hidden_state: torch.FloatTensor = None
     prior_means: torch.FloatTensor = None
     prior_log_variances: torch.FloatTensor = None
+    hidden_states: torch.FloatTensor = None
+    attentions: torch.FloatTensor = None
 class VitsWaveNet(torch.nn.Module):
 
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_layers = num_layers
         self.in_layers = torch.nn.ModuleList()
         self.res_skip_layers = torch.nn.ModuleList()
+        # if hasattr(nn.utils.parametrizations, "weight_norm"):
+        weight_norm = nn.utils.parametrizations.weight_norm
+        # else:
+        #     weight_norm = nn.utils.weight_norm
 
         for i in range(num_layers):
             dilation = config.wavenet_dilation_rate**i
             padding = (config.wavenet_kernel_size * dilation - dilation) // 2
 
                 res_skip_channels = 2 * config.hidden_size
             else:
                 res_skip_channels = config.hidden_size
 
             res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
             res_skip_layer = weight_norm(res_skip_layer, name="weight")
             self.res_skip_layers.append(res_skip_layer)
 
+    def forward(self, inputs):
         outputs = torch.zeros_like(inputs)
         num_channels = torch.IntTensor([self.hidden_size])[0]
         for i in range(self.num_layers):
             in_act = self.in_layers[i](inputs)
             # global_states = torch.zeros_like(hidden_states)  # style ?
             # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
             # --
             # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
             #     in_act = input_a  # + input_b
             t_act = torch.tanh(in_act[:, :num_channels, :])
             s_act = torch.sigmoid(in_act[:, num_channels:, :])
             acts = t_act * s_act
             res_skip_acts = self.res_skip_layers[i](acts)
             if i < self.num_layers - 1:
                 res_acts = res_skip_acts[:, : self.hidden_size, :]
+                inputs = inputs + res_acts
                 outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
             else:
                 outputs = outputs + res_skip_acts
+        return outputs
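The forward pass above inlines the gated activation that upstream `transformers` computes via `fused_add_tanh_sigmoid_multiply`; with the global conditioning tensor dropped, the gate reduces to a plain tanh/sigmoid split over the channel axis. A minimal sketch of that gate (the `hidden_size=192` below is only an illustrative value, not taken from the config in this diff):

```python
import torch

def gated_activation(in_act: torch.Tensor, num_channels: int) -> torch.Tensor:
    # Split a [B, 2*C, T] activation into a tanh "filter" half and a sigmoid
    # "gate" half and multiply them, as in VitsWaveNet.forward above.
    t_act = torch.tanh(in_act[:, :num_channels, :])
    s_act = torch.sigmoid(in_act[:, num_channels:, :])
    return t_act * s_act

in_act = torch.randn(1, 2 * 192, 100)  # output of one dilated in_layer conv
acts = gated_activation(in_act, 192)   # -> [1, 192, 100]
```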
     def get_padding(self, kernel_size, dilation=1):
         return (kernel_size * dilation - dilation) // 2
 
     def forward(self, hidden_states):
         for conv1, conv2 in zip(self.convs1, self.convs2):
             residual = hidden_states
 
             channels = config.upsample_initial_channel // (2 ** (i + 1))
             for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                 self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
 
         self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
 
+    def forward(self, spectrogram):
         hidden_states = self.conv_pre(spectrogram)
         for i in range(self.num_upsamples):
             hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
             hidden_states = self.upsampler[i](hidden_states)
             res_state = self.resblocks[i * self.num_kernels](hidden_states)
             for j in range(1, self.num_kernels):
                 res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
             hidden_states = res_state / self.num_kernels
         hidden_states = nn.functional.leaky_relu(hidden_states)
         hidden_states = self.conv_post(hidden_states)
         waveform = torch.tanh(hidden_states)
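In the decoder loop above, each upsampling stage feeds the same hidden states through `num_kernels` parallel residual blocks and averages the results, which is HiFi-GAN's multi-receptive-field fusion. A small sketch of that fusion, with the residual blocks abstracted to simple convolutions (stand-ins for illustration, not the real `HifiGanResidualBlock`):

```python
import torch

def mrf_fuse(hidden, resblocks):
    # Sum the outputs of parallel residual blocks over one upsample stage,
    # then average, as in the generator forward above.
    out = resblocks[0](hidden)
    for block in resblocks[1:]:
        out = out + block(hidden)
    return out / len(resblocks)

h = torch.randn(1, 256, 200)
blocks = [torch.nn.Conv1d(256, 256, k, padding=k // 2) for k in (3, 7, 11)]
y = mrf_fuse(h, blocks)  # same [1, 256, 200] shape as each block's output
```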
     def __init__(self, config):
         super().__init__()
         self.half_channels = config.flow_size // 2
         self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
         self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
         self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
 
+    def forward(self, x, reverse=False):
+        first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
+        hidden_states = self.conv_pre(first_half)
+        hidden_states = self.wavenet(hidden_states)
+        mean = self.conv_post(hidden_states)
+        second_half = second_half - mean
+        outputs = torch.cat([first_half, second_half], dim=1)
+        return outputs
 
 
 class VitsResidualCouplingBlock(nn.Module):
 
         for _ in range(config.prior_encoder_num_flows):
             self.flows.append(VitsResidualCouplingLayer(config))
 
+    def forward(self, x, reverse=False):
+        # x: [1, 192, 481]
+        for flow in reversed(self.flows):
+            x = torch.flip(x, [1])  # flip channels
+            x = flow(x, reverse=True)
+        return x
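Since only the reverse direction of the flow is needed at inference, each coupling layer above just subtracts the WaveNet-predicted mean from the second half of the channels, and the block iterates the flows backwards with a plain channel flip in place of a separate flip layer. A toy sketch of this mean-only inverse coupling, with the mean network stubbed out (names here are illustrative, not the module's API):

```python
import torch

def inverse_mean_coupling(x: torch.Tensor, predict_mean) -> torch.Tensor:
    # Invert z = [x1, x2 + m(x1)] back to [x1, x2]; mean-only coupling is
    # volume-preserving, so no log-determinant term is needed.
    half = x.shape[1] // 2
    first, second = x[:, :half], x[:, half:]
    return torch.cat([first, second - predict_mean(first)], dim=1)

z = torch.randn(1, 192, 481)                   # [B, flow_size, frames], shapes as in the comment above
x = inverse_mean_coupling(torch.flip(z, [1]),  # channel flip stands in for the flip between flows
                          predict_mean=lambda h: torch.zeros_like(h))  # stub for the WaveNet mean
```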
 class VitsAttention(nn.Module):
+    """has no positional info"""
 
     def __init__(self, config):
         super().__init__()
 
         self.scaling = self.head_dim**-0.5
 
         if (self.head_dim * self.num_heads) != self.embed_dim:
+            raise ValueError
         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
 
     def _shape(self, tensor, seq_len, bsz):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
 
     def forward(
         self,
         hidden_states,
+        layer_head_mask=None,
+        output_attentions=False,
     ):
+
         bsz, tgt_len, _ = hidden_states.size()
 
         src_len = key_states.size(1)
         attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
         attn_output = torch.bmm(attn_weights,
                                 value_states)
         attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
         attn_output = attn_output.transpose(1, 2)
 
         return attn_output, None  # attn_weights_reshaped
 
 class VitsFeedForward(nn.Module):
     def __init__(self, config):
 
         else:
             self.padding = None
 
+    def forward(self, hidden_states):
         hidden_states = hidden_states.permute(0, 2, 1)
         if self.padding is not None:
             hidden_states = nn.functional.pad(hidden_states, self.padding)
         hidden_states = self.conv_1(hidden_states)
         hidden_states = self.act_fn(hidden_states)
         if self.padding is not None:
             hidden_states = nn.functional.pad(hidden_states, self.padding)
         hidden_states = self.conv_2(hidden_states)
         hidden_states = hidden_states.permute(0, 2, 1)
         return hidden_states
 
     def __init__(self, config):
         super().__init__()
         self.attention = VitsAttention(config)
         self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.feed_forward = VitsFeedForward(config)
         self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
     def forward(
         self,
+        hidden_states,
+        output_attentions=False,
     ):
         residual = hidden_states
         hidden_states, attn_weights = self.attention(
             hidden_states=hidden_states,
+            # attention_mask=attention_mask,
             output_attentions=output_attentions,
         )
 
         hidden_states = self.layer_norm(residual + hidden_states)
 
         residual = hidden_states
+        hidden_states = self.feed_forward(hidden_states)
+
         hidden_states = self.final_layer_norm(residual + hidden_states)
 
         outputs = (hidden_states,)
 
         return outputs
 
     def forward(
         self,
+        hidden_states,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
     ):
+        for _layer in self.layers:
+            layer_outputs = _layer(hidden_states)
             hidden_states = layer_outputs[0]
         return BaseModelOutput(
             last_hidden_state=hidden_states,
+            # hidden_states=all_hidden_states,
+            # attentions=all_self_attentions,
         )
 
 class VitsTextEncoder(nn.Module):
     """
+    Has VitsEncoder
     """
 
     def __init__(self, config):
 
         self.encoder = VitsEncoder(config)  # 6 layers of VitsAttention
         self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
 
+    def forward(self, input_ids):
+        hidden_states = self.embed_tokens(input_ids) * math.sqrt(self.config.hidden_size)
+        last_hidden_state = self.encoder(hidden_states=hidden_states).last_hidden_state
+
+        stats = self.project(last_hidden_state.transpose(1, 2)).transpose(1, 2)
         prior_means, prior_log_variances = torch.split(stats, self.config.flow_size, dim=2)
 
         return VitsTextEncoderOutput(
             last_hidden_state=last_hidden_state,
             prior_means=prior_means,
+            # prior_log_variances=prior_log_variances,
+            # hidden_states=encoder_outputs.hidden_states,
+            # attentions=encoder_outputs.attentions,
         )
 
 class VitsPreTrainedModel(PreTrainedModel):
     config_class = VitsConfig
     base_model_prefix = "vits"
     main_input_name = "input_ids"
     supports_gradient_checkpointing = True
 
 
 class VitsModel(VitsPreTrainedModel):
 
         self.text_encoder = VitsTextEncoder(config)  # has VitsEncoder that includes 6 layers of VitsAttention
         self.flow = VitsResidualCouplingBlock(config)
         self.decoder = VitsHifiGan(config)
         # Initialize weights and apply final processing
         self.post_init()
 
     def forward(
         self,
         input_ids=None,
 
         output_hidden_states=None,
         return_dict=None,
         labels=None,
+        speed=None,
+        lang_code='deu',  # speed oscillation pattern per voice/lang
     ):
         mask_dtype = self.text_encoder.embed_tokens.weight.dtype
         if attention_mask is not None:
             input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
         else:
             input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
+        out = self.text_encoder(input_ids=input_ids)
+        hidden_states = out.last_hidden_state.transpose(1, 2)
         input_padding_mask = input_padding_mask.transpose(1, 2)
+        prior_means = out.prior_means
+        bs, _, in_len = hidden_states.shape
+        # VITS duration oscillation
+        if lang_code == 'deu':
+            pattern = [1, 2, 1]  # each voice (lang_code) sounds cooler with a different pattern
+        elif lang_code == 'rmc-script_latin':
+            pattern = [2, 2, 1, 2, 2]  # [2, 2, 2, 1, 2]
+        elif lang_code == 'hun':
+            # pattern = [1, 2, 2, 1, 1, 1]  # sounds cool / has valley-pause
+            pattern = [1, 2, 1, 1, 1]
         else:
+            pattern = [1, 2, 1]
+        duration = torch.tensor(pattern, device=hidden_states.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len]  # perhaps define [1, 2, 1] per voice or language
+        duration[:, :, 0] = 4
+        duration[:, :, -1] = 3
+        # ATTN
         predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
         indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
         output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
         output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
         attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
         batch_size, _, output_length, input_length = attn_mask.shape
         cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
 
         valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
         padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
         attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+        attn = attn[:, 0, :, :]
 
+        attn = attn + 1e-4 * torch.rand_like(attn)
+        attn /= attn.sum(2, keepdims=True)
+        # print(attn)
+        prior_means = torch.matmul(attn, prior_means)  # try attn to contain .5/.5 instead of 1/0 so it smoothly interpolates repeated prior_means
 
+        # prior_means = F.interpolate(prior_means.transpose(1, 2), int(1.74 * prior_means.shape[1]), mode='linear').transpose(1, 2)  # extend for slow speed
 
+        # prior means have now been replicated x duration of each prior mean
+        latents = self.flow(prior_means.transpose(1, 2),  # + torch.randn_like(prior_means) * .94,
+                            reverse=True)
+        waveform = self.decoder(latents)  # [bs, 1, 16000]
+        return waveform[:, 0, :]
 
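This forward pass is the heart of the commit: instead of running MMS-VITS's learned duration predictor, every token gets a fixed, repeating duration from a per-language pattern, and the attention matrix built from the cumulative durations hard-aligns tokens to output frames (the `1e-4` noise then blurs the 0/1 alignment slightly so repeated prior means interpolate). A standalone sketch of the same expansion using `repeat_interleave`, which is equivalent to the hard alignment before the noise is added (function names here are illustrative):

```python
import torch

def oscillating_durations(n_tokens: int, pattern=(1, 2, 1)) -> torch.Tensor:
    # Tile a fixed duration pattern over the token axis, as in VitsModel.forward above.
    d = torch.tensor(pattern).repeat(n_tokens // len(pattern) + 2)[:n_tokens].clone()
    d[0], d[-1] = 4, 3  # longer first/last token, matching the hard-coded boundary durations
    return d

def expand_by_duration(prior_means: torch.Tensor, durations: torch.Tensor) -> torch.Tensor:
    # Repeat each token's prior mean `duration` times -> frame-level priors [B, sum(d), C].
    return torch.repeat_interleave(prior_means, durations, dim=1)

means = torch.randn(1, 10, 192)  # [B, tokens, flow_size]
frames = expand_by_duration(means, oscillating_durations(10))
print(frames.shape)              # torch.Size([1, sum(durations), 192])
```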
 class VitsTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "vocab.json"}
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
 
         return text
 
     def prepare_for_tokenization(
+            self, text: str, is_split_into_words: bool = False, normalize=None, **kwargs):
         normalize = normalize if normalize is not None else self.normalize
 
         if normalize:
 
         tokens = list(text)
 
         if self.add_blank:
+            # sounds dyslexic if no space between letters
+            # sounds disconnected if >2 spaces between letters
+            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2)  # (* 2 + 1) raises a slice index error if tokens is odd
+            interspersed[::2] = tokens
+            tokens = interspersed + [self._convert_id_to_token(0)]  # append one last blank (the ::2 slice mismatches if tokens is odd)
 
         return tokens
 
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
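The `add_blank` branch above intersperses the blank token (id 0) between characters, which VITS/MMS checkpoints expect; per the comments, no blank sounds "dyslexic" and more than two sound disconnected. The same slicing trick on plain strings:

```python
def intersperse_blanks(tokens, blank="_"):
    # Even slots receive the tokens, odd slots stay blank, plus one trailing
    # blank, mirroring prepare_for_tokenization above (avoids the odd-length
    # slice mismatch of a (len * 2 + 1)-sized buffer).
    out = [blank] * (len(tokens) * 2)
    out[::2] = tokens
    return out + [blank]

print(intersperse_blanks(list("abc")))  # ['a', '_', 'b', '_', 'c', '_', '_']
```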
README.md
CHANGED
@@ -131,7 +131,7 @@ python live_demo.py # type text & plays AudioGen sound & TTS
 
 # Audiobook
 
-Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [no diffusio](https://www.youtube.com/watch?v=vahKXpd6oLg)
+Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [no diffusion](https://www.youtube.com/watch?v=vahKXpd6oLg) / [Audionar](https://youtu.be/fUGpfq_o_CU) / [F](https://www.youtube.com/watch?v=tlRdRV5nm40)
 
 ```python
 # audiobook will be saved in ./tts_audiobooks
Utils/JDC/__init__.py
DELETED
@@ -1 +0,0 @@

Utils/JDC/bst.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54dc94364b97e18ac1dfa6287714ed121248cfaac4cfd39d061c6e0a089ef169
size 21029926

Utils/JDC/model.py
DELETED
@@ -1,190 +0,0 @@
"""
Implementation of model from:
Kum et al. - "Joint Detection and Classification of Singing Voice Melody Using
Convolutional Recurrent Neural Networks" (2019)
Link: https://www.semanticscholar.org/paper/Joint-Detection-and-Classification-of-Singing-Voice-Kum-Nam/60a2ad4c7db43bace75805054603747fcd062c0d
"""
import torch
from torch import nn

class JDCNet(nn.Module):
    """
    Joint Detection and Classification Network model for singing voice melody.
    """
    def __init__(self, num_class=722, seq_len=31, leaky_relu_slope=0.01):
        super().__init__()
        self.num_class = num_class

        # input = (b, 1, 31, 513), b = batch size
        self.conv_block = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, padding=1, bias=False),  # out: (b, 64, 31, 513)
            nn.BatchNorm2d(num_features=64),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Conv2d(64, 64, 3, padding=1, bias=False),  # (b, 64, 31, 513)
        )

        # res blocks
        self.res_block1 = ResBlock(in_channels=64, out_channels=128)   # (b, 128, 31, 128)
        self.res_block2 = ResBlock(in_channels=128, out_channels=192)  # (b, 192, 31, 32)
        self.res_block3 = ResBlock(in_channels=192, out_channels=256)  # (b, 256, 31, 8)

        # pool block
        self.pool_block = nn.Sequential(
            nn.BatchNorm2d(num_features=256),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.MaxPool2d(kernel_size=(1, 4)),  # (b, 256, 31, 2)
            nn.Dropout(p=0.2),
        )

        # maxpool layers (for auxiliary network inputs)
        # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, 40))
        # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(1, 20))
        # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
        self.maxpool3 = nn.MaxPool2d(kernel_size=(1, 10))

        # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
        self.detector_conv = nn.Sequential(
            nn.Conv2d(640, 256, 1, bias=False),
            nn.BatchNorm2d(256),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Dropout(p=0.2),
        )

        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
        self.bilstm_classifier = nn.LSTM(
            input_size=512, hidden_size=256,
            batch_first=True, bidirectional=True)  # (b, 31, 512)

        # input: (b, 31, 512) - resized from (b, 256, 31, 2)
        self.bilstm_detector = nn.LSTM(
            input_size=512, hidden_size=256,
            batch_first=True, bidirectional=True)  # (b, 31, 512)

        # input: (b * 31, 512)
        self.classifier = nn.Linear(in_features=512, out_features=self.num_class)  # (b * 31, num_class)

        # input: (b * 31, 512)
        self.detector = nn.Linear(in_features=512, out_features=2)  # (b * 31, 2) - binary classifier

        # initialize weights
        self.apply(self.init_weights)

    def get_feature_GAN(self, x):
        seq_len = x.shape[-2]
        x = x.float().transpose(-1, -2)

        convblock_out = self.conv_block(x)

        resblock1_out = self.res_block1(convblock_out)
        resblock2_out = self.res_block2(resblock1_out)
        resblock3_out = self.res_block3(resblock2_out)
        poolblock_out = self.pool_block[0](resblock3_out)
        poolblock_out = self.pool_block[1](poolblock_out)

        return poolblock_out.transpose(-1, -2)

    def get_feature(self, x):
        seq_len = x.shape[-2]
        x = x.float().transpose(-1, -2)

        convblock_out = self.conv_block(x)

        resblock1_out = self.res_block1(convblock_out)
        resblock2_out = self.res_block2(resblock1_out)
        resblock3_out = self.res_block3(resblock2_out)
        poolblock_out = self.pool_block[0](resblock3_out)
        poolblock_out = self.pool_block[1](poolblock_out)

        return self.pool_block[2](poolblock_out)

    def forward(self, x):
        """
        Returns:
            classification_prediction, detection_prediction
            sizes: (b, 31, 722), (b, 31, 2)
        """
        ###############################
        # forward pass for classifier #
        ###############################
        seq_len = x.shape[-1]
        x = x.float().transpose(-1, -2)

        convblock_out = self.conv_block(x)

        resblock1_out = self.res_block1(convblock_out)
        resblock2_out = self.res_block2(resblock1_out)
        resblock3_out = self.res_block3(resblock2_out)

        poolblock_out = self.pool_block[0](resblock3_out)
        poolblock_out = self.pool_block[1](poolblock_out)
        GAN_feature = poolblock_out.transpose(-1, -2)
        poolblock_out = self.pool_block[2](poolblock_out)

        # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
        classifier_out = poolblock_out.permute(0, 2, 1, 3).contiguous().view((-1, seq_len, 512))
        classifier_out, _ = self.bilstm_classifier(classifier_out)  # ignore the hidden states

        classifier_out = classifier_out.contiguous().view((-1, 512))  # (b * 31, 512)
        classifier_out = self.classifier(classifier_out)
        classifier_out = classifier_out.view((-1, seq_len, self.num_class))  # (b, 31, num_class)

        # sizes: (b, 31, 722), (b, 31, 2)
        # classifier output consists of predicted pitch classes per frame
        # detector output consists of: (isvoice, notvoice) estimates per frame
        return torch.abs(classifier_out.squeeze()), GAN_feature, poolblock_out

    @staticmethod
    def init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.kaiming_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.Conv2d):
            nn.init.xavier_normal_(m.weight)
        elif isinstance(m, nn.LSTM) or isinstance(m, nn.LSTMCell):
            for p in m.parameters():
                if p.data is None:
                    continue

                if len(p.shape) >= 2:
                    nn.init.orthogonal_(p.data)
                else:
                    nn.init.normal_(p.data)


class ResBlock(nn.Module):
    def __init__(self, in_channels: int, out_channels: int, leaky_relu_slope=0.01):
        super().__init__()
        self.downsample = in_channels != out_channels

        # BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
        self.pre_conv = nn.Sequential(
            nn.BatchNorm2d(num_features=in_channels),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.MaxPool2d(kernel_size=(1, 2)),  # apply downsampling on the y axis only
        )

        # conv layers
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(leaky_relu_slope, inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
        )

        # 1 x 1 convolution layer to match the feature dimensions
        self.conv1by1 = None
        if self.downsample:
            self.conv1by1 = nn.Conv2d(in_channels, out_channels, 1, bias=False)

    def forward(self, x):
        x = self.pre_conv(x)
        if self.downsample:
            x = self.conv(x) + self.conv1by1(x)
        else:
            x = self.conv(x) + x
        return x
Utils/PLBERT/util.py
CHANGED
@@ -27,7 +27,7 @@ def load_plbert(log_dir):
 
     iters = [int(f.split('_')[-1].split('.')[0]) for f in ckpts if os.path.isfile(os.path.join(log_dir, f))]
     iters = sorted(iters)[-1]
 
-    checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".pth", map_location='cpu')
+    checkpoint = torch.load(log_dir + "/step_" + str(iters) + ".pth", map_location='cpu', weights_only=True)
     state_dict = checkpoint['net']
     from collections import OrderedDict
     new_state_dict = OrderedDict()
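For context on the one-line change above: `torch.load` with the default `weights_only=False` unpickles arbitrary Python objects, so a tampered `.pth` file can execute code at load time; `weights_only=True` restricts deserialization to tensors and plain containers. A hedged sketch (the checkpoint path below is hypothetical):

```python
import torch

# Safe-load a checkpoint dict; this raises if the file pickles anything
# beyond tensors/primitive containers instead of silently executing it.
checkpoint = torch.load("Utils/PLBERT/step_1000000.pth",  # hypothetical path
                        map_location="cpu", weights_only=True)
state_dict = checkpoint["net"]
```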
Utils/text_utils.py
CHANGED
@@ -35,72 +35,112 @@ class TextCleaner:
 
 # == Sentence Splitter
 
-prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
-suffixes = "(Inc|Ltd|Jr|Sr|Co)"
-starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
-acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-websites = "[.](com|net|org|io|gov|edu|me)"
-digits = "([0-9])"
-multiple_dots = r'\.{2,}'
-
-def split_into_sentences(text):
-    """
-    ... to incorrect splitting because they are used as markers for splitting.
-
-    :param text: text to be split into sentences
-    :type text: str
-    """
-    ...
+import re
+
+def split_into_sentences(text, max_len=200):
+    """
+    Splits a string into chunks of max_len characters, ensuring each chunk
+    terminates with a period if it was split mid-sentence. Prioritizes
+    splitting at natural sentence breaks and avoids splitting words.
+
+    Args:
+        text (str): The input string.
+        max_len (int): The maximum desired length for each chunk.
+
+    Returns:
+        list: A list of strings, where each string is a sentence chunk.
+    """
+    if not text:
+        return []
+
+    # Regex to split text into potential sentence candidates.
+    # We still use the lookbehind to keep the punctuation with the sentence.
+    sentence_candidates = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
+
+    # Handle the last part if it doesn't end with a punctuation (e.g., a phrase or incomplete sentence)
+    if text and not text.strip().endswith(('.', '!', '?')) and text.strip() not in sentence_candidates:
+        # Check if the last candidate already contains the end of the text.
+        # This is a heuristic, as re.split can sometimes be tricky with trailing non-matches.
+        if not (sentence_candidates and text.strip().endswith(sentence_candidates[-1])):
+            remaining_text = text.strip()
+            if sentence_candidates:
+                # Find the part of the text that wasn't included in sentence_candidates
+                last_candidate_start_index = text.rfind(sentence_candidates[-1])
+                if last_candidate_start_index != -1:
+                    remaining_text = text[last_candidate_start_index + len(sentence_candidates[-1]):].strip()
+
+            if remaining_text and not remaining_text.endswith(('.', '!', '?')):
+                sentence_candidates.append(remaining_text)
+
+    chunks = []
+    current_chunk_elements = []  # Stores individual sentences that form the current chunk
+    current_chunk_length = 0
+
+    for sentence in sentence_candidates:
+        # Calculate the length this sentence would add to the current chunk.
+        # Add 1 for the space that will separate sentences within a chunk, if needed.
+        potential_addition_length = len(sentence) + (1 if current_chunk_elements else 0)
+
+        # Check if adding this sentence would exceed the maximum length
+        if current_chunk_length + potential_addition_length > max_len:
+            # First, finalize the current chunk
+            if current_chunk_elements:
+                final_chunk = " ".join(current_chunk_elements).strip()
+                chunks.append(final_chunk)
+
+            # Reset for the new chunk and handle the current `sentence`.
+            # This `sentence` itself might be longer than `max_len`.
+            remaining_sentence = sentence
+            while len(remaining_sentence) > max_len:
+                # Prioritize splitting at a period or a space to avoid splitting words.
+                # Search backwards from `max_len - 1` to find the last valid break point.
+                split_point = -1
+                search_area = remaining_sentence[:max_len]
+
+                # Option 1: Find the last period in the search area
+                last_period_idx = search_area.rfind('.')
+                if last_period_idx != -1:
+                    split_point = last_period_idx
+
+                # Option 2: If no period, find the last space (to avoid splitting words)
+                if split_point == -1:
+                    last_space_idx = search_area.rfind(' ')
+                    if last_space_idx != -1:
+                        split_point = last_space_idx
+
+                if split_point != -1:
+                    # If a period or space is found, split there.
+                    # If it's a period, include it. If it's a space, don't include the space
+                    # but ensure the chunk ends with a period if it didn't already.
+                    chunk_to_add = remaining_sentence[:split_point + (1 if remaining_sentence[split_point] == '.' else 0)].strip()
+                    if not chunk_to_add.endswith('.'):
+                        chunk_to_add += '.'  # Ensure period termination
+
+                    chunks.append(chunk_to_add)
+                    remaining_sentence = remaining_sentence[split_point + 1:].lstrip()  # Update remaining
+                else:
+                    # No natural break (period or space) within max_len.
+                    # This happens for extremely long words or sequences without spaces.
+                    # In this rare case, we force split at max_len and append a period.
+                    chunks.append(remaining_sentence[:max_len].strip() + '.')
+                    remaining_sentence = remaining_sentence[max_len:].lstrip()  # Update remaining
+
+            # The `remaining_sentence` (now guaranteed to be `<= max_len`)
+            # becomes the start of the new `current_chunk`.
+            current_chunk_elements = [remaining_sentence]
+            current_chunk_length = len(remaining_sentence)
+
+        else:
+            # The current sentence fits within the `max_len`, so add it.
+            current_chunk_elements.append(sentence)
+            current_chunk_length += potential_addition_length
+
+    # After iterating through all sentences, add any remaining elements
+    # in `current_chunk_elements` as the final chunk.
+    if current_chunk_elements:
+        chunks.append(" ".join(current_chunk_elements).strip())
+    return chunks
 
 
 def store_ssml(text=None,
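A quick usage sketch of the new chunker; the exact boundaries depend on `max_len`, but every returned chunk stays at or under `max_len` characters and ends at a sentence or word boundary:

```python
from Utils.text_utils import split_into_sentences

text = ("The museum opens at nine. It closes at five! "
        "Visitors are asked to leave coats at the entrance.")
for chunk in split_into_sentences(text, max_len=60):
    print(len(chunk), repr(chunk))
# -> two chunks: the first merges the two short sentences (44 chars),
#    the second is the remaining 51-char sentence.
```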
api.py
CHANGED
@@ -113,21 +113,11 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
 
 def overlay(x, soundscape=None):
     if soundscape is not None:
-        # AudioGen sound is suffice to be ~10s long
         background = sound_generator.generate(soundscape,
-        #
-
-        # len_soundscape = len(background)
-        # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0
-        # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
-        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
-        # background will be longer by xtra .74s
-        x = .47 * x + .46 * background[:len(x)]
-    return x  # TTS / AudioGen @ 16kHz
+                                              duration=len(x)/16000 + .74,  # duration in seconds
+                                              ).detach().cpu().numpy()
+        x = .6 * x + .4 * background[:len(x)]
+    return x
 
 
 def tts_multi_sentence(precomputed_style_vector=None,
@@ -176,7 +166,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
 
     # volume
 
-    x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
+    x /= 1.12 * np.abs(x).max() + 1e-7  # amplify speech to near [-1, 1]; no amplification / normalisation on soundscapes
 
     return overlay(x, soundscape=soundscape)
@@ -211,7 +201,7 @@ def serve_wav():
 
         _shorten(r.get('native')[0]),
         affective=r.get('affective')[0],
         voice=r.get('voice')[0],
-        speed=
+        speed=None,  # obsolete due to oscillating MMS TTS VITS duration per language
         soundscape=r.get('soundscape')[0] if r.get(
             'soundscape') is not None else None,
     )
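The new `overlay` requests a background slightly longer than the speech (the extra .74 s absorbs AudioGen's generation granularity, per the old comment) and blends with fixed .6/.4 weights. A sketch of just the mixing step on plain arrays:

```python
import numpy as np

def mix(speech: np.ndarray, background: np.ndarray,
        w_speech: float = .6, w_bg: float = .4) -> np.ndarray:
    # Trim the (longer) background to the speech length, then blend linearly.
    # With weights summing to 1 and both signals in [-1, 1], the mix stays in [-1, 1].
    return w_speech * speech + w_bg * background[:len(speech)]

speech = np.random.uniform(-1, 1, 16000).astype(np.float32)        # 1 s @ 16 kHz
background = np.random.uniform(-1, 1, 16000 + 512).astype(np.float32)
out = mix(speech, background)  # still 16000 samples
```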
audiobook.py
CHANGED
@@ -8,20 +8,23 @@ import subprocess
 
 import numpy as np
 import soundfile
 import docx  # package = python-docx
 
 import urllib
 from pathlib import Path
 from moviepy.editor import *
 
-FS =
+FS = 16000
 ROOT_DIR = './tts_audiobooks/voices/'
 Path(ROOT_DIR).mkdir(parents=True,
                      exist_ok=True)
 voices = [
-    # 'en_US/vctk_low#p228',
-    'en_US/vctk_low#p326',
-    #
+    # 'en_US/vctk_low#p228',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#67854dcbd3e6beb1a78f7f20
     # 'af_ZA_google-nwu_0184',  # https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
+    # 'en_US/vctk_low#p326',
+    # 'en_US/vctk_low#p292',
+    # 'jv_ID_google-gmu_06207',
+    # 'fr_FR_m-ailabs_bernard',
+    'en_US_m-ailabs_mary_ann'
 ]  # select any voice from - https://audeering.github.io/shift/
 
 #urllib.request.urlretrieve("https://github.com/audeering/shift/raw/refs/heads/main/assets/INCLUSION_IN_MUSEUMS_audiobook.docx", "audiobook_TTS.docx")
@@ -54,7 +57,7 @@ for vox in voices:
 
     total = []
     chapter = []
 
     final_paragraph_for_saving_last_chapter = d.paragraphs[-1]
     final_paragraph_for_saving_last_chapter.text = 'CHAPTER: END OF AUDIOBOOK'
@@ -69,12 +72,6 @@ for vox in voices:
 
         if t.startswith('CHAPTER:'):
 
-            # silence for end chapter
-            chapter.append(np.zeros(int(.24 * FS),
-                                    dtype=np.float32))
 
             # chapter.wav
 
             audio = np.concatenate(chapter)
@@ -116,17 +113,14 @@ for vox in voices:
                 [
                     "python",
                     "tts.py",
                     "--text",
                     "_tmp.txt",
-                    # '--image', '_tmp_banner.png',
-                    # '--scene', 'calm sounds of castle',
+                    '--soundscape', 'birds formig' if chapter_counter < 2 else '',
                     '--voice', vox,
                     '--out_file', '_tmp'  # save on _tmp load audio and concat to total
                 ])
 
-            audio, _fs = soundfile.read('out/_tmp.wav')
-            audio = audresample.resample(audio.astype(np.float32), 24000, 16000)[0, :]
+            audio, _fs = soundfile.read('out/_tmp.wav')  # already 16 kHz
             # print('CHAPTER\n\n\n\n____', audio.shape,'____\n')
             chapter.append(audio)
@@ -140,9 +134,6 @@ for vox in voices:
 
         if not last_paragraph_was_silence:  # skip multiple empty paragraphs - silence is added only once
 
-            chapter.append(np.zeros(int(.1 * FS),
-                                    dtype=np.float32))
 
             last_paragraph_was_silence = True
 
     # save full .wav audiobook - for this voice
@@ -157,11 +148,7 @@ for vox in voices:
 
     # pic TTS voice
 
-    voice_pic = np.zeros((
-    shift_logo = cv2.imread('assets/shift_banner.png')
-    voice_pic[:100, :400, :] = shift_logo[:100, :400, :]
+    voice_pic = np.zeros((1920, 1080, 3), dtype=np.uint8)
 
     # voice name
     # frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
demo.py
CHANGED
@@ -1,68 +1,40 @@
 import numpy as np
 import soundfile
-import msinference
+import msinference  # api.py has also split into sentences for OOM
 from audiocraft.builders import AudioGen
 
-def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
-              voice='en_US/vctk_low#p326',  # 'en_US/vctk_low#p276', 'deu', 'af_ZA_google-nwu_1919', 'serbian', 'isl',
-              speed=1.14,
-              affect=True,  # False = higher clarity voice
-              soundscape='dogs barg in dungeons n dragons'
-              ):
-    '''16 KHz
-
-    voice : 'en_US/vctk_low#p276'  # Native English voices -> https://audeering.github.io/shift/
-
-    or
-
-    voice : 'af_ZA_google-nwu_1919'  # Non-Native English voices -> https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
-    '''
+def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
+              voice='en_US/m-ailabs_low#mary_ann',  # 'fr_FR_m-ailabs_bernard', 'deu', 'serbian', 'romanian', 'en_US/vctk_low#p326', 'en_US/vctk_low#p276', 'af_ZA_google-nwu_1919', 'isl',
+              soundscape='birds river'):
+    '''voice = 'en_US/vctk_low#p276'    # Native English voices > https://audeering.github.io/shift/
+             = 'af_ZA_google-nwu_1919'  # Non-native English voices > https://huggingface.co/dkounadis/artificial-styletts2/discussions/1#6783e3b00e7d90facec060c6
+             = 'deu'                    # Other languages > https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
+    '''
 
-    # StyleTTS2 - find voice from folder
     if ('en_US/' in voice) or ('en_UK/' in voice):
-        style_vector = msinference.compute_style('assets/wavs/style_vector' + a + '/' + voice.replace(
+        style_vector = msinference.compute_style('assets/wavs/style_vector/' + voice.replace(
             '/', '_').replace('#', '_').replace(
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
 
-        x = msinference.inference(text,
-                                  style_vector)
-
-    # find voice from mimic-3 folder with styles
-
+        x = msinference.inference(text, style_vector)
     elif '_' in voice:
         style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + voice.replace(
             '/', '_').replace('#', '_').replace(
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
 
-        x = msinference.inference(text,
-                                  style_vector)
-
-    # Fallback - MMS TTS - Non-English voice / langs
-
+        x = msinference.inference(text, style_vector)
     else:
-        x = msinference.foreign(text=text,
-                                speed=speed)  # volume normalis.
-
-    # volume
-
-    x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
-
+        x = msinference.foreign(text=text, lang=voice)
+    x /= 1.02 * np.abs(x).max() + 1e-7  # volume amplify to full [-1, 1]
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
-        background = sound_gen.generate(soundscape,
-                                        duration=len(x)/16000 + .74,  # sound duration in seconds
-                                        ).detach().cpu().numpy()
-        x = .
+        background = sound_gen.generate(soundscape, duration=len(x)/16000 + .74,  # sound duration in seconds
+                                        ).detach().cpu().numpy()
+        x = .6 * x + .4 * background[:len(x)]
     return x
 
 
 soundfile.write(f'demo.wav', tts_entry(), 16000)
live_demo.py
CHANGED
@@ -16,7 +16,6 @@ def send_to_server(args):
 
     'affective': True,
     'image': None,
     'video': None,
-    'speed': 1.14,
     'native': None,
 }
 
@@ -24,16 +23,15 @@ def send_to_server(args):
 
 
 args = SimpleNamespace()
-args.voice = '
-args.speed = 1.14
+args.voice = 'en_US/m-ailabs_low#judy_bieber'
 os.system('cls' if os.name == 'nt' else 'clear')
 while True:
     _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
 
-    _str += 'Lorem ipsum dolor sit amet, consetetur elixir sed diam nonumy eirmod tempor invidunt labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Soutet clita kasd gubergren.'
-
     args.soundscape = _str
+
+    _str += 'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
+
     args.text = '_tmp.txt'  # input -> .txt (implementation thought for audiobooks in API)
 
     with open(args.text, 'w') as f:
models.py
CHANGED
@@ -1,93 +1,98 @@
|
|
1 |
-
#coding:utf-8
|
2 |
|
3 |
import os
|
4 |
-
import math
|
5 |
import torch
|
6 |
import torch.nn as nn
|
7 |
import torch.nn.functional as F
|
8 |
-
from torch.nn.utils import
|
|
|
9 |
# from Utils.ASR.models import ASRCNN
|
10 |
-
from Utils.JDC.model import JDCNet
|
11 |
from Modules.hifigan import _tile, AdainResBlk1d
|
12 |
-
import
|
13 |
|
|
|
14 |
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
super().__init__()
|
18 |
-
self.
|
19 |
-
|
20 |
-
if self.
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def forward(self, x):
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
|
35 |
-
class
|
36 |
-
def __init__(self,
|
37 |
super().__init__()
|
38 |
-
self.
|
39 |
-
|
|
|
40 |
def forward(self, x):
|
41 |
-
|
42 |
-
return x
|
43 |
-
elif self.layer_type == 'timepreserve':
|
44 |
-
return F.avg_pool2d(x, (2, 1))
|
45 |
-
elif self.layer_type == 'half':
|
46 |
-
if x.shape[-1] % 2 != 0:
|
47 |
-
x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
|
48 |
-
return F.avg_pool2d(x, 2)
|
49 |
-
else:
|
50 |
-
raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
|
55 |
|
56 |
class ResBlk(nn.Module):
|
57 |
-
def __init__(self,
|
58 |
-
|
59 |
super().__init__()
|
60 |
-
self.actv =
|
61 |
-
self.
|
62 |
-
self.downsample = DownSample(downsample)
|
63 |
-
self.downsample_res = LearnedDownSample(downsample, dim_in)
|
64 |
self.learned_sc = dim_in != dim_out
|
65 |
-
self._build_weights(dim_in, dim_out)
|
66 |
-
|
67 |
-
def _build_weights(self, dim_in, dim_out):
|
68 |
self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
|
69 |
self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
|
70 |
-
if self.normalize:
|
71 |
-
self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
|
72 |
-
self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
|
73 |
if self.learned_sc:
|
74 |
-
self.conv1x1 = spectral_norm(
|
|
|
75 |
|
76 |
def _shortcut(self, x):
|
77 |
if self.learned_sc:
|
78 |
x = self.conv1x1(x)
|
79 |
-
if
|
80 |
-
x =
|
81 |
-
return x
|
82 |
|
83 |
def _residual(self, x):
|
84 |
-
if self.normalize:
|
85 |
-
x = self.norm1(x)
|
86 |
x = self.actv(x)
|
87 |
x = self.conv1(x)
|
88 |
x = self.downsample_res(x)
|
89 |
-
if self.normalize:
|
90 |
-
x = self.norm2(x)
|
91 |
x = self.actv(x)
|
92 |
x = self.conv2(x)
|
93 |
return x
|
@@ -101,113 +106,41 @@ class StyleEncoder(nn.Module):

    # for both acoustic & prosodic ref_s/p

-    def __init__(self, dim_in=64, style_dim=128, max_conv_dim=512):
        super().__init__()
-        blocks = []
-        blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
-        for _ in range(4):
-            dim_out = min(dim_in*2, max_conv_dim)
-            blocks += [ResBlk(dim_in, dim_out, downsample='half')]
            dim_in = dim_out
-        blocks += [nn.LeakyReLU(0.2)]
-        blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
-        # blocks += [nn.AdaptiveAvgPool2d(1)] # THIS AVERAGES THE TIME-FRAMES OF SPEAKER STYLE
-
-        blocks += [nn.LeakyReLU(0.2)]
        self.shared = nn.Sequential(*blocks)
-
        self.unshared = nn.Linear(dim_out, style_dim)

    def forward(self, x):
-        h = self.shared(x)
-
-        h = h.transpose(1, 3)
-        s = self.unshared(h)
-
        return s


class LinearNorm(torch.nn.Module):
-    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
-        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

-        torch.nn.init.xavier_uniform_(
-            self.linear_layer.weight,
-            gain=torch.nn.init.calculate_gain(w_init_gain))
-
    def forward(self, x):
        return self.linear_layer(x)


-class ResBlk1d(nn.Module):
-    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
-                 normalize=False, downsample='none', dropout_p=0.2):
-        super().__init__()
-        self.actv = actv
-        self.normalize = normalize
-        self.downsample_type = downsample
-        self.learned_sc = dim_in != dim_out
-        self._build_weights(dim_in, dim_out)
-        self.dropout_p = dropout_p
-
-        if self.downsample_type == 'none':
-            self.pool = nn.Identity()
-        else:
-            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
-
-    def _build_weights(self, dim_in, dim_out):
-        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
-        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
-        if self.normalize:
-            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
-            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
-        if self.learned_sc:
-            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
-
-    def downsample(self, x):
-        if self.downsample_type == 'none':
-            return x
-        else:
-            if x.shape[-1] % 2 != 0:
-                x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
-            return F.avg_pool1d(x, 2)
-
-    def _shortcut(self, x):
-        if self.learned_sc:
-            x = self.conv1x1(x)
-        x = self.downsample(x)
-        return x
-
-    def _residual(self, x):
-        if self.normalize:
-            x = self.norm1(x)
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-
-        x = self.conv1(x)
-        x = self.pool(x)
-        if self.normalize:
-            x = self.norm2(x)
-
-        x = self.actv(x)
-        x = F.dropout(x, p=self.dropout_p, training=self.training)
-
-        x = self.conv2(x)
-        return x
-
-    def forward(self, x):
-        x = self._shortcut(x) + self._residual(x)
-        return x / math.sqrt(2)  # unit variance
-
-
class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
@@ -222,168 +155,151 @@ class LayerNorm(nn.Module):

        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class TextEncoder(nn.Module):
-    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
-
        padding = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList()
        for _ in range(depth):
            self.cnn.append(nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
                LayerNorm(channels),
-
-
-        self.lstm = nn.LSTM(channels, channels//2, 1, batch_first=True, bidirectional=True)

-    def forward(self, x, input_lengths):
        x = self.embedding(x)  # [B, T, emb]
-        x = x.transpose(1, 2)
        for c in self.cnn:
-            x = c(x)
-        x = x.transpose(1, 2)
-        input_lengths = input_lengths.cpu().numpy()
-        x = nn.utils.rnn.pack_padded_sequence(
-            x, input_lengths,
-            batch_first=True,
-            enforce_sorted=False)
-        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
-        x, _ = nn.utils.rnn.pad_packed_sequence(
-            x, batch_first=True)
-        x = x.transpose(-1, -2)
        return x
-

class AdaLayerNorm(nn.Module):
-
-    # only instantiated in DurationPredictor()
-
    def __init__(self, style_dim, channels=None, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.fc = nn.Linear(style_dim, 1024)

    def forward(self, x, s):
-        h = self.fc(s
        gamma = h[:, :, :512]
        beta = h[:, :, 512:1024]
-
-        x = F.layer_norm(x.transpose(1, 2), (512, ), eps=self.eps)
        x = (1 + gamma) * x + beta
        return x  # [1, 75, 512]


class ProsodyPredictor(nn.Module):

-    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
-        super().__init__()
-
-        self.text_encoder = DurationEncoder(sty_dim=style_dim,
-                                            d_model=d_hid,
-                                            nlayers=nlayers,
-                                            dropout=dropout)

-        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True, dropout=dropout)
        self.duration_proj = LinearNorm(d_hid, max_dur)
-        self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True, dropout=dropout)
-
-        self.F0 = nn.ModuleList([
-            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
-            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout),
-            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout),
-        ])
-        self.N = nn.ModuleList([
-            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
-            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout),
-            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim, dropout_p=dropout),
-        ])
        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
-
    def F0Ntrain(self, x, s):

-        x, _ = self.shared(x.transpose(-1, -2))

        x = x.transpose(1, 2)  # [bs, ch, time]
-
        F0 = x
-
        for block in self.F0:
            # print(f'LOOP {F0.shape=} {s.shape=}\n')
            # )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
-            F0 = block(F0, s)
        F0 = self.F0_proj(F0)
-
        N = x
-
        for block in self.N:
            N = block(N, s)
        N = self.N_proj(N)
-
        return F0, N


class DurationEncoder(nn.Module):

-    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
-            self.lstms.append(nn.LSTM(d_model + sty_dim,
-                                      d_model // 2,
-                                      num_layers=1,
-                                      batch_first=True,
-                                      bidirectional=True,
-                                      dropout=dropout))
        self.lstms.append(AdaLayerNorm(sty_dim, d_model))
-
-        self.dropout = dropout
-        self.d_model = d_model
-        self.sty_dim = sty_dim

-    def forward(self, x, style, text_lengths):

-        style = _tile(style, length=x.shape[2])  # replicate style vector to duration of txt - F.interpolate or cyclic/tile

-        input_lengths = text_lengths.cpu().numpy()
-
        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
-
-                x = block(x, style)
-                x = torch.cat([x.transpose(1, 2), style], axis=1)  # [bs, 512, 75]

            else:
-
-                x = nn.utils.rnn.pack_padded_sequence(
-                    x, input_lengths, batch_first=True, enforce_sorted=False)
-                block.flatten_parameters()
-                x, _ = block(x)
-                x, _ = nn.utils.rnn.pad_packed_sequence(
-                    x, batch_first=True)
-                x = F.dropout(x, p=self.dropout, training=self.training)
-                x = x.transpose(-1, -2)
-        return x.transpose(-1, -2)

-
-def load_F0_models(path):
-    # load F0 model
-
-    F0_model = JDCNet(num_class=1, seq_len=192)
-    path = path.replace('.t7', '.pth')
-    params = torch.load(path, map_location='cpu')['net']
-    F0_model.load_state_dict(params)
-    _ = F0_model.train()
-
-    return F0_model
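With load_F0_models() and the JDC pitch extractor gone, F0 and energy now come out of the prosody predictor itself. The call path, as it appears on the new side of this diff (see msinference.py below), is:

# F0/N are predicted by ProsodyPredictor rather than extracted by JDCNet:
aln_trg, F0_pred, N_pred = predictor(d_en=d_en, s=ref_s[:, 128:, :])
x = decoder(asr=asr, F0_curve=F0_pred, N=N_pred, s=ref_s[:, :128, :])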
|
|
+# coding:utf-8

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
+from torch.nn.utils import spectral_norm
+from torch.nn.utils.parametrizations import weight_norm
# from Utils.ASR.models import ASRCNN
+# from Utils.JDC.model import JDCNet
from Modules.hifigan import _tile, AdainResBlk1d
+import math

+class MelSpec(torch.nn.Module):
+
+    def __init__(self,
+                 sample_rate=17402,  # https://github.com/fakerybakery/styletts2-cli/blob/main/msinference.py = Default 16000. However 17400 vocalises better also "en_US/vctk_p274"
+                 n_fft=2048,
+                 win_length=1200,
+                 hop_length=300,
+                 n_mels=80
+                 ):
+        '''avoids dependency on torchaudio'''
        super().__init__()
+        self.n_fft = n_fft
+        self.win_length = win_length if win_length is not None else n_fft
+        self.hop_length = hop_length if hop_length is not None else self.win_length // 2
+        # --
+        f_min = 0.0
+        f_max = float(sample_rate // 2)
+        all_freqs = torch.linspace(0, sample_rate // 2, n_fft // 2 + 1)
+        m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0))
+        m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0))
+        m_pts = torch.linspace(m_min, m_max, n_mels + 2)
+        f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0)
+        f_diff = f_pts[1:] - f_pts[:-1]  # (n_mels + 1)
+        slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1)
+        zero = torch.zeros(1)
+        down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_mels)
+        up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_mels)
+        fb = torch.max(zero, torch.min(down_slopes, up_slopes))
+        # --
+        self.register_buffer('fb', fb)
+        window = torch.hann_window(self.win_length)
+        self.register_buffer('window', window)

    def forward(self, x):
+        spec_f = torch.stft(x,
+                            self.n_fft,
+                            self.hop_length,
+                            self.win_length,
+                            self.window,
+                            center=True,
+                            pad_mode="reflect",
+                            normalized=False,
+                            onesided=True,
+                            return_complex=True)  # [bs, 1025, 56]
+        mel_specgram = torch.matmul(spec_f.abs().pow(2).transpose(1, 2), self.fb).transpose(1, 2)
+        return mel_specgram[:, None, :, :]  # [bs, 1, 80, time]
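MelSpec rebuilds the HTK-style mel filterbank by hand so the model no longer imports torchaudio. As a sanity check, a sketch assuming torchaudio is installed in a dev environment (power=2, Hann window, reflect padding and HTK mel scale are torchaudio's defaults, so the outputs should coincide):

import torch
import torchaudio

# Dev-only check: MelSpec vs. torchaudio with identical parameters.
mel = MelSpec(sample_rate=17402, n_fft=2048, win_length=1200, hop_length=300, n_mels=80)
ref = torchaudio.transforms.MelSpectrogram(
    sample_rate=17402, n_fft=2048, win_length=1200, hop_length=300, n_mels=80)
x = torch.randn(1, 24000)
print(torch.allclose(mel(x)[:, 0], ref(x), rtol=1e-4, atol=1e-4))  # expect True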
+class LearnedDownSample(nn.Module):
+    def __init__(self, dim_in):
        super().__init__()
+        self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(
+            3, 3), stride=(2, 2), groups=dim_in, padding=1))
+
    def forward(self, x):
+        return self.conv(x)


class ResBlk(nn.Module):
+    def __init__(self,
+                 dim_in, dim_out):
        super().__init__()
+        self.actv = nn.LeakyReLU(0.2)  # .07 also nice
+        self.downsample_res = LearnedDownSample(dim_in)
        self.learned_sc = dim_in != dim_out
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.learned_sc:
+            self.conv1x1 = spectral_norm(
+                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        if self.learned_sc:
            x = self.conv1x1(x)
+        if x.shape[3] % 2 != 0:  # [bs, 128, Freq, Time]
+            x = torch.cat([x, x[:, :, :, -1:]], dim=3)
+        return F.interpolate(x, scale_factor=.5, mode='nearest-exact')  # F.avg_pool2d(x, 2)

    def _residual(self, x):
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample_res(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x


class StyleEncoder(nn.Module):

    # for both acoustic & prosodic ref_s/p

+    def __init__(self,
+                 dim_in=64,
+                 style_dim=128,
+                 max_conv_dim=512):
        super().__init__()
+        blocks = [spectral_norm(nn.Conv2d(1, dim_in, 3, stride=1, padding=1))]
+        for _ in range(4):
+            dim_out = min(dim_in * 2,
+                          max_conv_dim)
+            blocks += [ResBlk(dim_in, dim_out)]
            dim_in = dim_out
+        blocks += [nn.LeakyReLU(0.24),  # w/o this activation - produces no speech
+                   spectral_norm(nn.Conv2d(dim_out, dim_out, 5, stride=1, padding=0)),
+                   nn.LeakyReLU(0.2)  # 0.3 sounds nice
+                   ]
        self.shared = nn.Sequential(*blocks)
        self.unshared = nn.Linear(dim_out, style_dim)

    def forward(self, x):
+        x = self.shared(x)
+        x = x.mean(3, keepdims=True)  # comment this line for time varying style vector
+        x = x.transpose(1, 3)
+        s = self.unshared(x)
        return s
class LinearNorm(torch.nn.Module):
+    def __init__(self, in_dim, out_dim, bias=True):
+        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

    def forward(self, x):
        return self.linear_layer(x)


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()

        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)

+
class TextEncoder(nn.Module):
+    def __init__(self, channels, kernel_size, depth, n_symbols):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
        padding = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList()
        for _ in range(depth):
            self.cnn.append(nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
                LayerNorm(channels),
+                nn.LeakyReLU(0.24))
+            )
+        self.lstm = nn.LSTM(channels, channels // 2, 1,
+                            batch_first=True, bidirectional=True)

+    def forward(self, x):
        x = self.embedding(x)  # [B, T, emb]
+        x = x.transpose(1, 2)
        for c in self.cnn:
+            x = c(x)
+        x = x.transpose(1, 2)
        x, _ = self.lstm(x)
        return x
+
+
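The packed-sequence plumbing is gone because inference now runs a single unpadded utterance. A quick shape walk-through (illustrative values only; text_encoder is the instance built in msinference.py below, with channels=512):

import torch

tokens = torch.randint(0, 100, (1, 30))   # [bs=1, 30 symbols]; vocab size arbitrary here
h = text_encoder(tokens)                  # conv stack + bi-LSTM -> [1, 30, 512]
print(h.shape)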
184 |
class AdaLayerNorm(nn.Module):
|
185 |
+
|
|
|
|
|
186 |
def __init__(self, style_dim, channels=None, eps=1e-5):
|
187 |
super().__init__()
|
188 |
self.eps = eps
|
189 |
self.fc = nn.Linear(style_dim, 1024)
|
190 |
|
191 |
def forward(self, x, s):
|
192 |
+
h = self.fc(s)
|
193 |
gamma = h[:, :, :512]
|
194 |
beta = h[:, :, 512:1024]
|
195 |
+
x = F.layer_norm(x, (512, ), eps=self.eps)
|
|
|
196 |
x = (1 + gamma) * x + beta
|
197 |
return x # [1, 75, 512]
|
198 |
|
199 |
+
|
200 |
class ProsodyPredictor(nn.Module):
|
201 |
|
202 |
+
def __init__(self, style_dim, d_hid, nlayers, max_dur=50):
|
203 |
+
super().__init__()
|
|
|
|
|
|
|
|
|
|
|
204 |
|
205 |
+
self.text_encoder = DurationEncoder(sty_dim=style_dim,
|
206 |
+
d_model=d_hid,
|
207 |
+
nlayers=nlayers) # called outside forward
|
208 |
+
self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2,
|
209 |
+
1, batch_first=True, bidirectional=True)
|
210 |
self.duration_proj = LinearNorm(d_hid, max_dur)
|
211 |
+
self.shared = nn.LSTM(d_hid + style_dim, d_hid //
|
212 |
+
2, 1, batch_first=True, bidirectional=True)
|
213 |
+
self.F0 = nn.ModuleList([
|
214 |
+
AdainResBlk1d(d_hid, d_hid, style_dim),
|
215 |
+
AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
|
216 |
+
AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim),
|
217 |
+
])
|
218 |
+
self.N = nn.ModuleList([
|
219 |
+
AdainResBlk1d(d_hid, d_hid, style_dim),
|
220 |
+
AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
|
221 |
+
AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim)
|
222 |
+
])
|
223 |
self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
|
224 |
self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
|
225 |
+
|
226 |
def F0Ntrain(self, x, s):
|
227 |
|
228 |
+
x, _ = self.shared(x) # [bs, time, ch] LSTM
|
229 |
|
230 |
x = x.transpose(1, 2) # [bs, ch, time]
|
231 |
+
|
|
|
232 |
F0 = x
|
233 |
+
|
234 |
for block in self.F0:
|
235 |
# print(f'LOOP {F0.shape=} {s.shape=}\n')
|
236 |
# )N F0.shape=torch.Size([1, 512, 147]) s.shape=torch.Size([1, 128])
|
237 |
+
# This is an AdainResBlk1d expects conv1d dimensions
|
238 |
+
F0 = block(F0, s)
|
239 |
F0 = self.F0_proj(F0)
|
240 |
+
|
241 |
N = x
|
242 |
+
|
243 |
for block in self.N:
|
244 |
N = block(N, s)
|
245 |
N = self.N_proj(N)
|
246 |
+
|
247 |
return F0, N
|
248 |
+
|
249 |
+
def forward(self, d_en=None, s=None):
|
250 |
+
blend = self.text_encoder(d_en, s)
|
251 |
+
x, _ = self.lstm(blend)
|
252 |
+
dur = self.duration_proj(x) # [bs, 150, 50]
|
253 |
+
|
254 |
+
_, input_length, classifier_50 = dur.shape
|
255 |
+
|
256 |
+
dur = dur[0, :, :]
|
257 |
+
dur = torch.sigmoid(dur).sum(1)
|
258 |
+
dur = dur.round().clamp(min=1).to(torch.int64)
|
259 |
+
aln_trg = torch.zeros(1,
|
260 |
+
dur.sum(),
|
261 |
+
input_length,
|
262 |
+
device=s.device)
|
263 |
+
c_frame = 0
|
264 |
+
for i in range(input_length):
|
265 |
+
aln_trg[:, c_frame:c_frame + dur[i], i] = 1
|
266 |
+
c_frame += dur[i]
|
267 |
+
en = torch.bmm(aln_trg, blend)
|
268 |
+
F0_pred, N_pred = self.F0Ntrain(en, s)
|
269 |
+
return aln_trg, F0_pred, N_pred
|
270 |
+
|
271 |
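forward() turns the 50-bin duration logits into integer frame counts (sigmoid, sum, round, clamp) and expands them into a hard 0/1 alignment. A toy version of that expansion, detached from the model:

import torch

# Three tokens lasting 2, 1 and 3 frames -> a [1, 6, 3] alignment matrix.
dur = torch.tensor([2, 1, 3])
aln = torch.zeros(1, int(dur.sum()), len(dur))
c_frame = 0
for i, d in enumerate(dur.tolist()):
    aln[:, c_frame:c_frame + d, i] = 1    # token i owns d consecutive frames
    c_frame += d
blend = torch.randn(1, 3, 8)              # [bs, tokens, channels]
en = torch.bmm(aln, blend)                # [1, 6, 8]: each token row repeated dur[i] times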
class DurationEncoder(nn.Module):

+    def __init__(self, sty_dim=128, d_model=512, nlayers=3):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
+            self.lstms.append(nn.LSTM(d_model + sty_dim,
+                                      d_model // 2,
+                                      num_layers=1,
+                                      batch_first=True,
+                                      bidirectional=True
+                                      ))
        self.lstms.append(AdaLayerNorm(sty_dim, d_model))

+    def forward(self, x, style):

+        _, _, input_lengths = x.shape  # [bs, 512, time]

+        style = _tile(style, length=x.shape[2]).transpose(1, 2)
+        x = x.transpose(1, 2)

        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
+                x = block(x, style)  # LSTM has transposed x
            else:
+                x = torch.cat([x, style], axis=2)
+                # LSTM
+                x, _ = block(x)  # expects [bs, time, chan] OUTPUTS [bs, time, 2*chan] 2x FROM BIDIRECTIONAL
+
+        return torch.cat([x, style], axis=2)  # predictor.lstm()
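DurationEncoder leans on _tile from Modules.hifigan to stretch the style code across the token axis. The real implementation lives in hifigan.py; a minimal stand-in with the same contract (an assumption, for illustration only) would be:

import torch

def _tile(style, length):
    # Hypothetical stand-in for Modules.hifigan._tile: cycle a [bs, ch, t]
    # style code along its last axis until it spans `length` frames.
    reps = -(-length // style.shape[2])             # ceil(length / t)
    return style.repeat(1, 1, reps)[:, :, :length]  # [bs, ch, length]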
msinference.py
CHANGED
@@ -3,25 +3,28 @@ import sys

import tempfile
import re
import os
-from num2words import num2words
from collections import OrderedDict
from Modules.hifigan import Decoder
from Utils.PLBERT.util import load_plbert
import phonemizer
import torch
from cached_path import cached_path
-
import audresample
-
import numpy as np
import yaml
-import torchaudio
import librosa
-from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
from nltk.tokenize import word_tokenize
from Utils.text_utils import transliterate_number
import textwrap
-

@@ -56,45 +59,33 @@ class TextCleaner:

textclenaer = TextCleaner()

-
-to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
-mean, std = -4, 4
-
-
def alpha_num(f):
    f = re.sub(' +', ' ', f)  # delete spaces
    f = re.sub(r'[^A-Z a-z0-9 ]+', '', f)  # del non alpha num
    return f

-
-def preprocess(wave):
-    wave_tensor = torch.from_numpy(wave).float()
-    mel_tensor = to_mel(wave_tensor)
-    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
-    return mel_tensor
-

def compute_style(path):
-
    if sr != 24000:
-
-
    with torch.no_grad():
-
-
-        s = s[:, :, 0, :].transpose(1, 2)  # [1, 128, 11]
-        return s  # [1, 128, 11]

-
-device = 'cuda'

global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us', preserve_punctuation=True, with_stress=True)

@@ -104,10 +95,6 @@ global_phonemizer = phonemizer.backend.EspeakBackend(

args = yaml.safe_load(open(str('Utils/config.yml')))
ASR_config = args['ASR_config']

-F0_path = args['F0_path']
-pitch_extractor = load_F0_models(F0_path).eval().to(device)
-
-
bert = load_plbert(args['PLBERT_dir']).eval().to(device)

decoder = Decoder(dim_in=512,

@@ -128,8 +115,7 @@ text_encoder = TextEncoder(channels=512,

predictor = ProsodyPredictor(style_dim=128,
                             d_hid=512,
                             nlayers=3,  # OFFICIAL config.nlayers=5;
-                             max_dur=50,
-                             dropout=.2).eval().to(device)

style_encoder = StyleEncoder(dim_in=64,
                             style_dim=128,

@@ -141,9 +127,10 @@ bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)

# params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
params_whole = torch.load(str(cached_path(
-    "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
params = params_whole['net']
-

def _del_prefix(d):
    # del ".module"

@@ -163,95 +150,41 @@ predictor_encoder.load_state_dict(_del_prefix(

    params['predictor_encoder']), strict=True)
style_encoder.load_state_dict(_del_prefix(
    params['style_encoder']), strict=True)
-pitch_extractor.load_state_dict(_del_prefix(
-    params['pitch_extractor']), strict=True)
-
-# def _shift(x):
-#     # [bs, samples] shift circular each batch elem of sound
-#     n = x.shape[1]
-#     for i, batch_elem in enumerate(x):
-#         offset = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0 TBD
-#         x[i, ...] = torch.roll(batch_elem, offset, dims=1)  # batch_elem = [400000, ]
-#     return x
-

def inference(text,
-              ref_s
-
-    text = transliterate_number(text, lang='en').strip()
-
    ps = global_phonemizer.phonemize([text])
-    # print(f'PHONEMIZER: {ps=}\n\n')  # PHONEMIZER: ps=['ɐbˈɛbæbləm ']
    ps = word_tokenize(ps[0])
-    # # print(f'TOKENIZER: {ps=}\n\n')  # TOKENIZER: ps=['ɐbˈɛbæbləm']
    ps = ' '.join(ps)
    tokens = textclenaer(ps)
-    # print(f'TEXTCLEAN: {ps=}\n\n')  # TEXTCLEAN: ps='ɐbˈɛbæbləm'
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-    # print(f'TOKENSFINAL: {ps=}\n\n')
-
    with torch.no_grad():
-        input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-
-        hidden_states = text_encoder(tokens, input_lengths)
-
-        bert_dur = bert(tokens, attention_mask=None)
        d_en = bert_encoder(bert_dur).transpose(-1, -2)
-        ref = ref_s[:, :128, :]  # [bs, 128, 11]
-        s = ref_s[:, 128:, :]
-        d = predictor.text_encoder(d_en, s, input_lengths)
-        d = d.transpose(1, 2)
-        # -------------------------------- pred_aln_trg = clones bert frames as duration
-
-        d = predictor.text_encoder(d_en,
-                                   s,
-                                   input_lengths)
-
-        x, _ = predictor.lstm(d)
-
-        duration = predictor.duration_proj(x)
-
-        duration = torch.sigmoid(duration).sum(axis=-1)
-        pred_dur = torch.round(duration.squeeze()).clamp(min=1)

-        pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
-        c_frame = 0
-        for i in range(pred_aln_trg.size(0)):
-            pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
-            c_frame += int(pred_dur[i].data)

-
-        asr_new = torch.zeros_like(asr)
-        asr_new[:, :, 0] = asr[:, :, 0]
-        asr_new[:, :, 1:] = asr[:, :, 0:-1]
-        asr = asr_new
-        # -
-
-        x = decoder(asr=asr,
-                    F0_curve=F0_pred,
-                    N=N_pred,
-                    s=ref)
-
-    x = x.cpu().numpy()[0, 0, :-400]  # weird pulse at the end of sentences
-
-    # StyleTTS2 is 24kHz -> Resample to 16kHz of AudioGen / MMS

    if x.shape[0] > 10:
-
        x = audresample.resample(signal=x.astype(np.float32),
                                 original_rate=24000,
-                                 target_rate=16000)[0, :]  # reshapes (64,) -> (1,64)

    else:
        print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\n', x.shape)

@@ -346,17 +279,14 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

    elif 'rom' in lang:

        lang_code = 'ron'
-        speed = 1.24 if speed is None else speed

-    elif 'ger' in lang:

        lang_code = 'deu'
-        speed = 1.14 if speed is None else speed

    elif 'alban' in lang:

        lang_code = 'sqi'
-        speed = 1.04 if speed is None else speed

    else:

@@ -364,38 +294,38 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

    # load VITS

-    net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
-    tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

    total_audio = []

    # Split long sentences if deu to control voice switch - for other languages let text no-split
    if not isinstance(text, list):
-
-        # However prosody is nicer on non-split for MMS TTS
-        # prepend txt snippet
-        text = [
-            sub_sent + ' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]
-        # assert that it chooses unique voice
-    else:
-        # allow longer non split text
-        text = [
-            sub_sent + ' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]
-        # for non deu MMS TTS lang.

    for _t in text:

        _t = _t.lower()

-        #
-        print('\n\n\n\nBEF Transliteration', _t, '\n\n\n\n\n')
-        _t = transliterate_number(_t, lang=lang_code)
-        print('AFT nums', _t, '\n____________________________________________')

    if lang_code == 'rmc-script_latin':

@@ -417,7 +347,7 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

    x = net_g(input_ids=inputs.input_ids.to(device),
              attention_mask=inputs.attention_mask.to(device),
-
              )[0, :]

    # crop the 1st audio - is PREFIX text 156000 samples to choose deu voice / VitsAttention()

@@ -428,8 +358,6 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ

    x = torch.cat(total_audio).cpu().numpy()

-    x /= np.abs(x).max() + 1e-7
-
-    # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)

    return x  # 16kHz - only resample StyleTTS2 from 24kHz -> 16kHz
|
|
import tempfile
import re
import os
from collections import OrderedDict
from Modules.hifigan import Decoder
from Utils.PLBERT.util import load_plbert
import phonemizer
import torch
from cached_path import cached_path
+import nltk
import audresample
+nltk.download('punkt', download_dir='./')  # comment if downloaded once
+nltk.download('punkt_tab', download_dir='./')
+nltk.data.path.append('.')
import numpy as np
import yaml
import librosa
+from models import ProsodyPredictor, TextEncoder, StyleEncoder, MelSpec
from nltk.tokenize import word_tokenize
from Utils.text_utils import transliterate_number
import textwrap
+
+device = 'cpu'
+if torch.cuda.is_available():
+    device = 'cuda'

_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '

textclenaer = TextCleaner()

def alpha_num(f):
    f = re.sub(' +', ' ', f)  # delete spaces
    f = re.sub(r'[^A-Z a-z0-9 ]+', '', f)  # del non alpha num
    return f

+mel_spec = MelSpec().to(device)

def compute_style(path):
+    x, sr = librosa.load(path, sr=24000)
+    x, _ = librosa.effects.trim(x, top_db=30)
    if sr != 24000:
+        x = librosa.resample(x, sr, 24000)
+
    with torch.no_grad():
+        x = torch.from_numpy(x[None, :]).to(device=device, dtype=torch.float)
+
+        mel_tensor = (torch.log(1e-5 + mel_spec(x)) + 4) / 4  # same as (log(mel) - mean) / std with mean=-4, std=4
+
+        # mel_tensor = preprocess(audio).to(device)

+        ref_s = style_encoder(mel_tensor)
+        ref_p = predictor_encoder(mel_tensor)  # [bs, 11, 1, 128]

+        s = torch.cat([ref_s, ref_p], dim=3)  # [bs, 11, 1, 256]

+        s = s[:, :, 0, :].transpose(1, 2)  # [1, 256, 11]
+    return s  # [1, 256, 11]
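Usage sketch for the rewritten compute_style (the wav path below is a placeholder): it loads, trims and log-mel-normalises a 24 kHz reference, then stacks the acoustic and prosodic encodings along the channel axis.

ref_s = compute_style('voices/some_reference.wav')  # placeholder path
print(ref_s.shape)  # [1, 256, T_ref]: rows 0-127 acoustic, rows 128-255 prosodic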
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us', preserve_punctuation=True, with_stress=True)

args = yaml.safe_load(open(str('Utils/config.yml')))
ASR_config = args['ASR_config']

bert = load_plbert(args['PLBERT_dir']).eval().to(device)

decoder = Decoder(dim_in=512,

predictor = ProsodyPredictor(style_dim=128,
                             d_hid=512,
                             nlayers=3,  # OFFICIAL config.nlayers=5;
+                             max_dur=50).eval().to(device)

style_encoder = StyleEncoder(dim_in=64,
                             style_dim=128,

# params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
params_whole = torch.load(str(cached_path(
+    "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu', weights_only=True)
params = params_whole['net']
+# params['decoder'].pop('module.generator.m_source.l_linear.weight')
+# params['decoder'].pop('module.generator.m_source.l_linear.bias')  # SourceHNSF

def _del_prefix(d):
    # del ".module"

    params['predictor_encoder']), strict=True)
style_encoder.load_state_dict(_del_prefix(
    params['style_encoder']), strict=True)

def inference(text,
+              ref_s):
+    # text = transliterate_number(text, lang='en').strip()  # transliteration only used for foreign(); perhaps add extra . after ? ;
    ps = global_phonemizer.phonemize([text])
    ps = word_tokenize(ps[0])
    ps = ' '.join(ps)
    tokens = textclenaer(ps)
    tokens.insert(0, 0)
    tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
    with torch.no_grad():
+        hidden_states = text_encoder(tokens)
+        bert_dur = bert(tokens, attention_mask=torch.ones_like(tokens))
        d_en = bert_encoder(bert_dur).transpose(-1, -2)

+        aln_trg, F0_pred, N_pred = predictor(d_en=d_en, s=ref_s[:, 128:, :])

+        asr = torch.bmm(aln_trg, hidden_states)
+        asr = asr.transpose(1, 2)
+        asr = torch.cat([asr[:, :, 0:1], asr[:, :, 0:-1]], 2)
+        x = decoder(asr=asr,              # [1, 512, 201]
+                    F0_curve=F0_pred,     # [1, 1, 402] 2x time
+                    N=N_pred,             # [1, 1, 402] 2x time
+                    s=ref_s[:, :128, :])  # [1, 256, 1]

+    x = x.cpu().numpy()[0, 0, :]
+    x[-400:] = 0  # noisy pulse produced for unterminated sentences, in absence of punctuation (not sure if same behaviour for all voices)

+    # StyleTTS2 is 24kHz -> resample to 16kHz as AudioGen / MMS

    if x.shape[0] > 10:
+
        x = audresample.resample(signal=x.astype(np.float32),
                                 original_rate=24000,
+                                 target_rate=16000)[0, :]  # audresample reshapes (64,) -> (1,64) | volume normalisation applies in api.py:tts_multi_sentence()

    else:
        print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\n', x.shape)
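Putting the two entry points together (paths and text are placeholders; soundfile is one way to write the array, assuming it is available):

import soundfile as sf

ref_s = compute_style('voices/some_reference.wav')  # placeholder reference wav
wav = inference('A quick synthesis test.', ref_s)   # numpy array at 16 kHz
sf.write('out.wav', wav, 16000)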
|
|
    elif 'rom' in lang:

        lang_code = 'ron'

+    elif 'ger' in lang or 'deu' in lang or 'allem' in lang:

        lang_code = 'deu'

    elif 'alban' in lang:

        lang_code = 'sqi'

    else:

    # load VITS

+    # net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
+    # tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
+    global cached_lang_code, cached_net_g, cached_tokenizer

+    if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
+        cached_lang_code = lang_code
+        cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
+        cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
+
+    net_g = cached_net_g
+    tokenizer = cached_tokenizer

    total_audio = []

    # Split long sentences if deu to control voice switch - for other languages let text no-split
    if not isinstance(text, list):
+        # split very long sentences
+        text = [sub_sent + ' ' for sub_sent in textwrap.wrap(text, 440, break_long_words=0)]

    for _t in text:

        _t = _t.lower()

+        # NUMBERS

+        try:
+            _t = transliterate_number(_t, lang=lang_code)
+        except NotImplementedError:
+            print(f'Transliterate numbers - NotImplemented for {lang_code=}', _t, '\n____________________________________________')
+
+        # PRONOUNC.

    if lang_code == 'rmc-script_latin':

    x = net_g(input_ids=inputs.input_ids.to(device),
              attention_mask=inputs.attention_mask.to(device),
+              lang_code=lang_code,
              )[0, :]

    # crop the 1st audio - is PREFIX text 156000 samples to choose deu voice / VitsAttention()

    x = torch.cat(total_audio).cpu().numpy()

+    # x /= np.abs(x).max() + 1e-7  ~ volume normalisation @ api.py:tts_multi_sentence() OR demo.py

    return x  # 16kHz - only resample StyleTTS2 from 24kHz -> 16kHz
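An illustrative call of the MMS/VITS path (assuming the lang keyword matched inside the function body; the language string only needs to contain one of the substrings tested above, so 'german' hits the deu branch and the model is cached across calls):

x = foreign(text='Guten Morgen, wie geht es dir?', lang='german')  # 16 kHz numpy array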
requirements.txt
CHANGED
@@ -18,4 +18,4 @@ srt

nltk
phonemizer
docx
-

nltk
phonemizer
docx
+uroman
tts.py
CHANGED
@@ -91,13 +91,13 @@ def command_line_args():

def send_to_server(args):
    url = "http://192.168.88.209:5000"
-
    # Args

    payload = {
        'affective': args.affective,
        'voice': args.voice,
-        'soundscape': args.soundscape,
        'native': args.native,
        'text': args.text,
        'image': args.image,

def send_to_server(args):
    url = "http://192.168.88.209:5000"
+
    # Args

    payload = {
        'affective': args.affective,
        'voice': args.voice,
+        'soundscape': args.soundscape if args.soundscape != '' else None,
        'native': args.native,
        'text': args.text,
        'image': args.image,