# coding:utf-8

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import spectral_norm
from torch.nn.utils.parametrizations import weight_norm
# from Utils.ASR.models import ASRCNN
# from Utils.JDC.model import JDCNet
from Modules.hifigan import _tile, AdainResBlk1d
import math

class MelSpec(torch.nn.Module):
    """Power mel-spectrogram computed with ``torch.stft`` and a fixed filterbank.

    The triangular filterbank (mel formula ``2595 * log10(1 + f / 700)``) is
    built once at construction and stored, together with the Hann analysis
    window, as module buffers (``fb`` and ``window``). This avoids a
    dependency on torchaudio.

    Args:
        sample_rate: Sampling rate used to place the filterbank edges; the
            upper edge is ``sample_rate // 2``.
        n_fft: FFT size; the spectrum has ``n_fft // 2 + 1`` bins.
        win_length: Window length; falls back to ``n_fft`` when ``None``.
        hop_length: Hop size; falls back to ``win_length // 2`` when ``None``.
        n_mels: Number of mel filters.
    """

    def __init__(self,
                 sample_rate=17402,  # reference (https://github.com/fakerybakery/styletts2-cli/blob/main/msinference.py) defaults to 16000; author's note: ~17400 vocalises better, also for "en_US/vctk_p274"
                 n_fft=2048,
                 win_length=1200,
                 hop_length=300,
                 n_mels=80
                 ):
        super().__init__()
        self.n_fft = n_fft
        self.win_length = n_fft if win_length is None else win_length
        self.hop_length = self.win_length // 2 if hop_length is None else hop_length
        # Buffers are registered in the order 'fb', then 'window'.
        self.register_buffer('fb', self._triangular_filterbank(sample_rate, n_fft, n_mels))
        self.register_buffer('window', torch.hann_window(self.win_length))

    @staticmethod
    def _triangular_filterbank(sample_rate, n_fft, n_mels):
        """Return the ``(n_fft // 2 + 1, n_mels)`` triangular mel filter matrix."""
        hz_low = 0.0
        hz_high = float(sample_rate // 2)
        # Centre frequency of every one-sided STFT bin.
        bin_hz = torch.linspace(0, sample_rate // 2, n_fft // 2 + 1)
        # Filter edges are equally spaced on the mel axis, then mapped back to Hz.
        mel_low = 2595.0 * math.log10(1.0 + (hz_low / 700.0))
        mel_high = 2595.0 * math.log10(1.0 + (hz_high / 700.0))
        mel_edges = torch.linspace(mel_low, mel_high, n_mels + 2)
        hz_edges = 700.0 * (10 ** (mel_edges / 2595.0) - 1.0)
        edge_gap = hz_edges[1:] - hz_edges[:-1]  # (n_mels + 1)
        # offset[i, j] = edge j minus bin frequency i.
        offset = hz_edges.unsqueeze(0) - bin_hz.unsqueeze(1)
        ramp_from_left = (-1.0 * offset[:, :-2]) / edge_gap[:-1]  # (n_freqs, n_mels)
        ramp_to_right = offset[:, 2:] / edge_gap[1:]  # (n_freqs, n_mels)
        # A triangle is the lower of the two ramps, floored at zero.
        floor = torch.zeros(1)
        return torch.max(floor, torch.min(ramp_from_left, ramp_to_right))

    def forward(self, x):
        """Return the power mel-spectrogram of ``x`` with a singleton channel axis.

        ``x`` is passed straight to ``torch.stft``; the indexing below assumes
        a batched input, i.e. a ``[bs, samples]`` waveform tensor.
        """
        stft = torch.stft(x,
                          n_fft=self.n_fft,
                          hop_length=self.hop_length,
                          win_length=self.win_length,
                          window=self.window,
                          center=True,
                          pad_mode="reflect",
                          normalized=False,
                          onesided=True,
                          return_complex=True)  # [bs, n_fft // 2 + 1, frames]
        power = stft.abs().pow(2)
        # Project the frequency axis onto the mel filters: [bs, n_mels, frames].
        mel = torch.matmul(power.transpose(1, 2), self.fb).transpose(1, 2)
        return mel.unsqueeze(1)  # [bs, 1, n_mels, frames]


class LearnedDownSample(nn.Module):
    """Learned 2x down-sampling with a depthwise, spectrally-normalised conv.

    Uses one 3x3 filter per channel (``groups=dim_in``) with stride 2 and
    padding 1, so the channel count is preserved and each of the two trailing
    axes of size ``n`` becomes ``floor((n - 1) / 2) + 1``.

    Args:
        dim_in: Number of input (and output) channels.
    """

    def __init__(self, dim_in):
        super().__init__()
        depthwise = nn.Conv2d(in_channels=dim_in,
                              out_channels=dim_in,
                              kernel_size=(3, 3),
                              stride=(2, 2),
                              padding=1,
                              groups=dim_in)
        self.conv = spectral_norm(depthwise)

    def forward(self, x):
        """Apply the strided depthwise convolution to ``x``."""
        halved = self.conv(x)
        return halved


class ResBlk(nn.Module):
    """Residual block that halves both trailing axes of a 4-D feature map.

    The residual branch is ``LeakyReLU -> 3x3 conv -> learned stride-2
    down-sampling -> LeakyReLU -> 3x3 conv``; the shortcut branch is an
    optional 1x1 conv (only when the channel count changes) followed by
    nearest-neighbour down-sampling. The two branches are summed and divided
    by ``sqrt(2)``.

    Args:
        dim_in: Number of input channels.
        dim_out: Number of output channels.
    """

    def __init__(self,
                 dim_in, dim_out):
        super().__init__()
        self.actv = nn.LeakyReLU(0.2)   # .07 also nice
        self.downsample_res = LearnedDownSample(dim_in)
        # A projection on the shortcut is only needed when channels change.
        self.learned_sc = dim_in != dim_out
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.learned_sc:
            self.conv1x1 = spectral_norm(
                nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Project channels if needed, then halve axes 2 and 3 (rounding up)."""
        if self.learned_sc:
            x = self.conv1x1(x)
        # The residual branch uses a kernel-3 / stride-2 / padding-1 conv, which
        # maps an axis of size n to ceil(n / 2). Interpolating by 0.5 gives
        # floor(n / 2), so an odd axis is first extended by repeating its last
        # slice. Previously only axis 3 was handled; an odd axis 2 made the two
        # branches disagree in shape.
        if x.shape[2] % 2 != 0:
            x = torch.cat([x, x[:, :, -1:, :]], dim=2)
        if x.shape[3] % 2 != 0:  # [bs, 128, Freq, Time]
            x = torch.cat([x, x[:, :, :, -1:]], dim=3)
        return F.interpolate(x, scale_factor=.5, mode='nearest-exact')  # F.avg_pool2d(x, 2)

    def _residual(self, x):
        """Pre-activation conv stack with learned down-sampling in the middle."""
        x = self.actv(x)
        x = self.conv1(x)
        x = self.downsample_res(x)
        x = self.actv(x)
        x = self.conv2(x)
        return x

    def forward(self, x):
        """Return ``(shortcut(x) + residual(x)) / sqrt(2)``."""
        x = self._shortcut(x) + self._residual(x)
        return x / math.sqrt(2)  # unit variance


class StyleEncoder(nn.Module):
    """Convolutional encoder that turns a spectrogram into a style vector.

    Used for both the acoustic and the prosodic reference style (ref_s / ref_p).

    Layout of ``shared``: a 3x3 stem conv, four ``ResBlk`` stages whose width
    doubles up to ``max_conv_dim``, then ``LeakyReLU -> 5x5 conv (no padding)
    -> LeakyReLU``. ``unshared`` is a linear projection to ``style_dim``.

    Args:
        dim_in: Channel width after the stem convolution.
        style_dim: Size of the produced style vector.
        max_conv_dim: Upper bound on the channel width.
    """

    def __init__(self,
                 dim_in=64,
                 style_dim=128,
                 max_conv_dim=512):
        super().__init__()
        layers = [spectral_norm(nn.Conv2d(1, dim_in, 3, stride=1, padding=1))]
        width = dim_in
        for _ in range(4):
            wider = min(2 * width, max_conv_dim)
            layers.append(ResBlk(width, wider))
            width = wider
        layers.extend([
            nn.LeakyReLU(0.24),  # w/o this activation - produces no speech
            # Unpadded 5x5 conv: shrinks each trailing axis by 4.
            spectral_norm(nn.Conv2d(width, width, 5, stride=1, padding=0)),
            nn.LeakyReLU(0.2),  # 0.3 sounds nice
        ])
        self.shared = nn.Sequential(*layers)
        self.unshared = nn.Linear(width, style_dim)

    def forward(self, x):
        """Encode ``x`` and return the projected style tensor.

        NOTE(review): presumably ``x`` is ``[bs, 1, n_mels, frames]`` as
        produced by ``MelSpec`` — confirm against the caller.
        """
        feats = self.shared(x)
        # Average over the last axis; comment this line for time varying style vector
        pooled = feats.mean(3, keepdim=True)
        # Move channels last so the linear layer acts on them.
        return self.unshared(pooled.transpose(1, 3))


class LinearNorm(nn.Module):
    """Thin wrapper holding a single ``nn.Linear`` under ``linear_layer``.

    Args:
        in_dim: Size of each input feature vector.
        out_dim: Size of each output feature vector.
        bias: Whether the linear layer has an additive bias.
    """

    def __init__(self, in_dim, out_dim, bias=True):
        super().__init__()
        self.linear_layer = nn.Linear(in_features=in_dim,
                                      out_features=out_dim,
                                      bias=bias)

    def forward(self, x):
        """Apply the affine map to the last axis of ``x``."""
        projected = self.linear_layer(x)
        return projected


class LayerNorm(nn.Module):
    """Layer normalisation over axis 1 (channels) of a channel-first tensor.

    Axis 1 is swapped to the end, ``F.layer_norm`` is applied with the learned
    scale ``gamma`` and shift ``beta``, and the axes are swapped back.

    Args:
        channels: Size of axis 1 of the input.
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        # Identity initialisation: unit scale, zero shift.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` across axis 1 and return it in the original layout."""
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(channels_last,
                              normalized_shape=(self.channels,),
                              weight=self.gamma,
                              bias=self.beta,
                              eps=self.eps)
        return normed.transpose(1, -1)


class TextEncoder(nn.Module):
    """Symbol embedding -> stack of 1-D conv blocks -> bidirectional LSTM.

    Each conv block is ``weight-normed Conv1d -> LayerNorm -> LeakyReLU(0.24)``
    and keeps the channel count. The LSTM has ``channels // 2`` hidden units
    per direction.

    Args:
        channels: Embedding size and conv channel width.
        kernel_size: Conv kernel size; padding is ``(kernel_size - 1) // 2``.
        depth: Number of conv blocks.
        n_symbols: Size of the embedding table.
    """

    def __init__(self, channels, kernel_size, depth, n_symbols):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)
        pad = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList([
            nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels,
                                      kernel_size=kernel_size, padding=pad)),
                LayerNorm(channels),
                nn.LeakyReLU(0.24),
            )
            for _ in range(depth)
        ])
        self.lstm = nn.LSTM(input_size=channels,
                            hidden_size=channels // 2,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x):
        """Encode symbol indices ``x`` and return the LSTM output sequence.

        NOTE(review): presumably ``x`` is a ``[B, T]`` integer tensor — confirm
        against the caller.
        """
        hidden = self.embedding(x).transpose(1, 2)  # [B, T, emb] -> [B, emb, T]
        for conv_block in self.cnn:
            hidden = conv_block(hidden)
        # The LSTM is batch_first, so put time back on axis 1.
        encoded, _ = self.lstm(hidden.transpose(1, 2))
        return encoded


class AdaLayerNorm(nn.Module):
    """Layer normalisation whose scale and shift are predicted from a style.

    A linear layer maps the style tensor to ``2 * channels`` values; the first
    half is ``gamma`` and the second half is ``beta``. The output is
    ``(1 + gamma) * layer_norm(x) + beta``, with the normalisation taken over
    the last axis of ``x``.

    Args:
        style_dim: Size of the last axis of the style tensor.
        channels: Size of the last axis of ``x``. ``None`` selects 512, the
            width that used to be hard-coded regardless of this argument.
        eps: Value added to the variance for numerical stability.
    """

    def __init__(self, style_dim, channels=None, eps=1e-5):
        super().__init__()
        # Previously `channels` was accepted but ignored and 512 / 1024 were
        # hard-coded, so any other feature width failed inside layer_norm.
        self.channels = 512 if channels is None else channels
        self.eps = eps
        self.fc = nn.Linear(style_dim, 2 * self.channels)

    def forward(self, x, s):
        """Normalise ``x`` and modulate it with the style ``s``.

        ``s`` must broadcast against ``x`` on every axis but the last, e.g.
        ``x`` of ``[bs, time, channels]`` with ``s`` of ``[bs, time, style_dim]``.
        """
        h = self.fc(s)
        gamma = h[..., :self.channels]
        beta = h[..., self.channels:]
        x = F.layer_norm(x, (self.channels,), eps=self.eps)
        # `1 + gamma` makes a zero prediction leave the normalised input as is.
        x = (1 + gamma) * x + beta
        return x  # same shape as the input x


class ProsodyPredictor(nn.Module):
    """Predicts per-token durations and frame-level F0 / N curves from a style.

    ``forward`` runs the duration encoder, turns the duration logits into an
    integer frame count per token, builds a hard token-to-frame alignment
    matrix, expands the encoded sequence to frame rate with it, and feeds the
    result to ``F0Ntrain``.

    Args:
        style_dim: Size of the style vector.
        d_hid: Hidden width of the encoder / LSTMs.
        nlayers: Number of LSTM + AdaLayerNorm pairs in the duration encoder.
        max_dur: Number of outputs of the duration projection; the predicted
            duration of a token is the sum of their sigmoids.
    """

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50):
        super().__init__()

        self.text_encoder = DurationEncoder(sty_dim=style_dim,
                                            d_model=d_hid,
                                            nlayers=nlayers)
        self.lstm = nn.LSTM(d_hid + style_dim, d_hid // 2,
                            1, batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)
        self.shared = nn.LSTM(d_hid + style_dim, d_hid //
                              2, 1, batch_first=True, bidirectional=True)
        # Two parallel stacks with the same layout: one for F0, one for N.
        self.F0 = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, d_hid // 2,  style_dim, upsample=True),
            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim),
            ])
        self.N = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim),
            AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True),
            AdainResBlk1d(d_hid // 2, d_hid // 2, style_dim)
            ])
        self.F0_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(d_hid // 2, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
        """Return the ``(F0, N)`` predictions for frame-rate features ``x``.

        ``x`` is ``[bs, time, ch]``; it goes through the shared LSTM and is then
        processed channel-first by the two style-conditioned block stacks,
        each followed by a 1x1 conv down to a single channel.
        """
        x, _ = self.shared(x)  # [bs, time, ch] LSTM

        x = x.transpose(1, 2)  # [bs, ch, time]

        F0 = x
        for block in self.F0:
            # AdainResBlk1d expects conv1d layout, e.g. F0 [1, 512, 147], s [1, 128]
            F0 = block(F0, s)
        F0 = self.F0_proj(F0)

        N = x
        for block in self.N:
            N = block(N, s)
        N = self.N_proj(N)

        return F0, N

    @staticmethod
    def _alignment_from_durations(dur, device):
        """Build the hard alignment ``[1, sum(dur), len(dur)]`` for integer ``dur``.

        Entry ``[0, f, i]`` is 1 when frame ``f`` belongs to token ``i`` and 0
        otherwise; token ``i`` owns ``dur[i]`` consecutive frames.
        """
        n_tokens = dur.shape[0]
        # Token index of every output frame, e.g. dur [2, 1] -> [0, 0, 1].
        token_of_frame = torch.repeat_interleave(
            torch.arange(n_tokens, device=dur.device), dur)
        n_frames = token_of_frame.shape[0]
        aln_trg = torch.zeros(1, n_frames, n_tokens, device=device)
        aln_trg[0,
                torch.arange(n_frames, device=device),
                token_of_frame.to(device)] = 1
        return aln_trg

    def forward(self, d_en=None, s=None):
        """Return ``(aln_trg, F0_pred, N_pred)`` for encoded text and style.

        Only the first batch element's durations are used and the alignment
        has a batch axis of 1, so this path is written for batch size 1.
        """
        blend = self.text_encoder(d_en, s)
        x, _ = self.lstm(blend)
        dur = self.duration_proj(x)  # [bs, tokens, max_dur]

        dur = dur[0, :, :]
        dur = torch.sigmoid(dur).sum(1)
        # Every token gets at least one frame.
        dur = dur.round().clamp(min=1).to(torch.int64)
        # Vectorised replacement for the per-token fill loop.
        aln_trg = self._alignment_from_durations(dur, s.device)
        # One-hot rows select (repeat) the token features for each frame.
        en = torch.bmm(aln_trg, blend)
        F0_pred, N_pred = self.F0Ntrain(en, s)
        return aln_trg, F0_pred, N_pred


class DurationEncoder(nn.Module):
    """Alternating bidirectional-LSTM / AdaLayerNorm stack conditioned on style.

    ``lstms`` holds ``nlayers`` pairs, each an LSTM followed by an
    ``AdaLayerNorm``. The style is concatenated to the features before every
    LSTM and once more on the final output.

    Args:
        sty_dim: Size of the style vector.
        d_model: Feature width of the sequence.
        nlayers: Number of LSTM + AdaLayerNorm pairs.
    """

    def __init__(self, sty_dim=128, d_model=512, nlayers=3):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
            recurrent = nn.LSTM(d_model + sty_dim,
                                d_model // 2,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=True)
            self.lstms.extend([recurrent, AdaLayerNorm(sty_dim, d_model)])

    def forward(self, x, style):
        """Encode ``x`` (``[bs, d_model, time]``) under ``style``.

        Returns the features with the tiled style appended on the last axis;
        this is what the predictor's LSTM consumes.
        """
        _, _, n_steps = x.shape  # [bs, 512, time]

        # Repeat the style along time, then go time-major for the LSTMs.
        style = _tile(style, length=n_steps).transpose(1, 2)
        x = x.transpose(1, 2)

        for layer in self.lstms:
            if not isinstance(layer, AdaLayerNorm):
                # LSTM: expects [bs, time, chan]; bidirectional output is 2 * hidden wide.
                x, _ = layer(torch.cat([x, style], dim=2))
                continue
            x = layer(x, style)

        return torch.cat([x, style], dim=2)  # predictor.lstm()