Spaces:
				
			
			
	
			
			
					
		Running
		
			on 
			
			CPU Upgrade
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
			on 
			
			CPU Upgrade
	feat: update infer
Browse files
- app.py +4 -4
- lib/infer_pack/models.py +26 -8
- lib/infer_pack/models_dml.py +1124 -0
- lib/infer_pack/onnx_inference.py +3 -1
- requirements.txt +1 -1
- vc_infer_pipeline.py +13 -1
    	
        app.py
    CHANGED
    
    | @@ -41,7 +41,7 @@ else: | |
| 41 | 
             
                f0method_mode = ["pm", "harvest", "crepe"]
         | 
| 42 | 
             
                f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
         | 
| 43 |  | 
| 44 | 
            -
            def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index):
         | 
| 45 | 
             
                def vc_fn(
         | 
| 46 | 
             
                    vc_audio_mode,
         | 
| 47 | 
             
                    vc_input, 
         | 
| @@ -57,7 +57,7 @@ def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index): | |
| 57 | 
             
                    protect,
         | 
| 58 | 
             
                ):
         | 
| 59 | 
             
                    try:
         | 
| 60 | 
            -
                        print(f" | 
| 61 | 
             
                        if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
         | 
| 62 | 
             
                            audio, sr = librosa.load(vc_input, sr=16000, mono=True)
         | 
| 63 | 
             
                        elif vc_audio_mode == "Upload audio":
         | 
| @@ -104,7 +104,7 @@ def create_vc_fn(model_title, tgt_sr, net_g, vc, if_f0, version, file_index): | |
| 104 | 
             
                            f0_file=None,
         | 
| 105 | 
             
                        )
         | 
| 106 | 
             
                        info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
         | 
| 107 | 
            -
                        print(f"{ | 
| 108 | 
             
                        return info, (tgt_sr, audio_opt)
         | 
| 109 | 
             
                    except:
         | 
| 110 | 
             
                        info = traceback.format_exc()
         | 
| @@ -159,7 +159,7 @@ def load_model(): | |
| 159 | 
             
                            net_g = net_g.float()
         | 
| 160 | 
             
                        vc = VC(tgt_sr, config)
         | 
| 161 | 
             
                        print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
         | 
| 162 | 
            -
                        models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn( | 
| 163 | 
             
                    categories.append([category_title, category_folder, description, models])
         | 
| 164 | 
             
                return categories
         | 
| 165 |  | 
|  | |
| 41 | 
             
                f0method_mode = ["pm", "harvest", "crepe"]
         | 
| 42 | 
             
                f0method_info = "PM is fast, Harvest is good but extremely slow, and Crepe effect is good but requires GPU (Default: PM)"
         | 
| 43 |  | 
| 44 | 
            +
            def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
         | 
| 45 | 
             
                def vc_fn(
         | 
| 46 | 
             
                    vc_audio_mode,
         | 
| 47 | 
             
                    vc_input, 
         | 
|  | |
| 57 | 
             
                    protect,
         | 
| 58 | 
             
                ):
         | 
| 59 | 
             
                    try:
         | 
| 60 | 
            +
                        print(f"Converting using {model_name}...")
         | 
| 61 | 
             
                        if vc_audio_mode == "Input path" or "Youtube" and vc_input != "":
         | 
| 62 | 
             
                            audio, sr = librosa.load(vc_input, sr=16000, mono=True)
         | 
| 63 | 
             
                        elif vc_audio_mode == "Upload audio":
         | 
|  | |
| 104 | 
             
                            f0_file=None,
         | 
| 105 | 
             
                        )
         | 
| 106 | 
             
                        info = f"[{datetime.now().strftime('%Y-%m-%d %H:%M')}]: npy: {times[0]}, f0: {times[1]}s, infer: {times[2]}s"
         | 
| 107 | 
            +
                        print(f"{model_name} | {info}")
         | 
| 108 | 
             
                        return info, (tgt_sr, audio_opt)
         | 
| 109 | 
             
                    except:
         | 
| 110 | 
             
                        info = traceback.format_exc()
         | 
|  | |
| 159 | 
             
                            net_g = net_g.float()
         | 
| 160 | 
             
                        vc = VC(tgt_sr, config)
         | 
| 161 | 
             
                        print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
         | 
| 162 | 
            +
                        models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
         | 
| 163 | 
             
                    categories.append([category_title, category_folder, description, models])
         | 
| 164 | 
             
                return categories
         | 
| 165 |  | 
    	
        lib/infer_pack/models.py
    CHANGED
    
    | @@ -631,12 +631,17 @@ class SynthesizerTrnMs256NSFsid(nn.Module): | |
| 631 | 
             
                    o = self.dec(z_slice, pitchf, g=g)
         | 
| 632 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 633 |  | 
| 634 | 
            -
                def infer(self, phone, phone_lengths, pitch, nsff0, sid,  | 
| 635 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 636 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         | 
| 637 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 638 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 639 | 
            -
                    o = self.dec( | 
| 640 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 641 |  | 
| 642 |  | 
| @@ -742,12 +747,17 @@ class SynthesizerTrnMs768NSFsid(nn.Module): | |
| 742 | 
             
                    o = self.dec(z_slice, pitchf, g=g)
         | 
| 743 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 744 |  | 
| 745 | 
            -
                def infer(self, phone, phone_lengths, pitch, nsff0, sid,  | 
| 746 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 747 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         | 
| 748 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 749 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 750 | 
            -
                    o = self.dec( | 
| 751 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 752 |  | 
| 753 |  | 
| @@ -844,12 +854,16 @@ class SynthesizerTrnMs256NSFsid_nono(nn.Module): | |
| 844 | 
             
                    o = self.dec(z_slice, g=g)
         | 
| 845 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 846 |  | 
| 847 | 
            -
                def infer(self, phone, phone_lengths, sid,  | 
| 848 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 849 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         | 
| 850 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
|  | |
|  | |
|  | |
|  | |
| 851 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 852 | 
            -
                    o = self.dec( | 
| 853 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 854 |  | 
| 855 |  | 
| @@ -946,12 +960,16 @@ class SynthesizerTrnMs768NSFsid_nono(nn.Module): | |
| 946 | 
             
                    o = self.dec(z_slice, g=g)
         | 
| 947 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 948 |  | 
| 949 | 
            -
                def infer(self, phone, phone_lengths, sid,  | 
| 950 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 951 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         | 
| 952 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
|  | |
|  | |
|  | |
|  | |
| 953 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 954 | 
            -
                    o = self.dec( | 
| 955 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 956 |  | 
| 957 |  | 
|  | |
| 631 | 
             
                    o = self.dec(z_slice, pitchf, g=g)
         | 
| 632 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 633 |  | 
| 634 | 
            +
                def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
         | 
| 635 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 636 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         | 
| 637 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
| 638 | 
            +
                    if rate:
         | 
| 639 | 
            +
                        head = int(z_p.shape[2] * rate)
         | 
| 640 | 
            +
                        z_p = z_p[:, :, -head:]
         | 
| 641 | 
            +
                        x_mask = x_mask[:, :, -head:]
         | 
| 642 | 
            +
                        nsff0 = nsff0[:, -head:]
         | 
| 643 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 644 | 
            +
                    o = self.dec(z * x_mask, nsff0, g=g)
         | 
| 645 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 646 |  | 
| 647 |  | 
|  | |
| 747 | 
             
                    o = self.dec(z_slice, pitchf, g=g)
         | 
| 748 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 749 |  | 
| 750 | 
            +
                def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
         | 
| 751 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 752 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         | 
| 753 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
| 754 | 
            +
                    if rate:
         | 
| 755 | 
            +
                        head = int(z_p.shape[2] * rate)
         | 
| 756 | 
            +
                        z_p = z_p[:, :, -head:]
         | 
| 757 | 
            +
                        x_mask = x_mask[:, :, -head:]
         | 
| 758 | 
            +
                        nsff0 = nsff0[:, -head:]
         | 
| 759 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 760 | 
            +
                    o = self.dec(z * x_mask, nsff0, g=g)
         | 
| 761 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 762 |  | 
| 763 |  | 
|  | |
| 854 | 
             
                    o = self.dec(z_slice, g=g)
         | 
| 855 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 856 |  | 
| 857 | 
            +
                def infer(self, phone, phone_lengths, sid, rate=None):
         | 
| 858 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 859 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         | 
| 860 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
| 861 | 
            +
                    if rate:
         | 
| 862 | 
            +
                        head = int(z_p.shape[2] * rate)
         | 
| 863 | 
            +
                        z_p = z_p[:, :, -head:]
         | 
| 864 | 
            +
                        x_mask = x_mask[:, :, -head:]
         | 
| 865 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 866 | 
            +
                    o = self.dec(z * x_mask, g=g)
         | 
| 867 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 868 |  | 
| 869 |  | 
|  | |
| 960 | 
             
                    o = self.dec(z_slice, g=g)
         | 
| 961 | 
             
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
         | 
| 962 |  | 
| 963 | 
            +
                def infer(self, phone, phone_lengths, sid, rate=None):
         | 
| 964 | 
             
                    g = self.emb_g(sid).unsqueeze(-1)
         | 
| 965 | 
             
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
         | 
| 966 | 
             
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         | 
| 967 | 
            +
                    if rate:
         | 
| 968 | 
            +
                        head = int(z_p.shape[2] * rate)
         | 
| 969 | 
            +
                        z_p = z_p[:, :, -head:]
         | 
| 970 | 
            +
                        x_mask = x_mask[:, :, -head:]
         | 
| 971 | 
             
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
         | 
| 972 | 
            +
                    o = self.dec(z * x_mask, g=g)
         | 
| 973 | 
             
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 974 |  | 
| 975 |  | 
    	
        lib/infer_pack/models_dml.py
    ADDED
    
    | @@ -0,0 +1,1124 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import math, pdb, os
         | 
| 2 | 
            +
            from time import time as ttime
         | 
| 3 | 
            +
            import torch
         | 
| 4 | 
            +
            from torch import nn
         | 
| 5 | 
            +
            from torch.nn import functional as F
         | 
| 6 | 
            +
            from lib.infer_pack import modules
         | 
| 7 | 
            +
            from lib.infer_pack import attentions
         | 
| 8 | 
            +
            from lib.infer_pack import commons
         | 
| 9 | 
            +
            from lib.infer_pack.commons import init_weights, get_padding
         | 
| 10 | 
            +
            from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
         | 
| 11 | 
            +
            from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
         | 
| 12 | 
            +
            from lib.infer_pack.commons import init_weights
         | 
| 13 | 
            +
            import numpy as np
         | 
| 14 | 
            +
            from lib.infer_pack import commons
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
            class TextEncoder256(nn.Module):
         | 
| 18 | 
            +
                def __init__(
         | 
| 19 | 
            +
                    self,
         | 
| 20 | 
            +
                    out_channels,
         | 
| 21 | 
            +
                    hidden_channels,
         | 
| 22 | 
            +
                    filter_channels,
         | 
| 23 | 
            +
                    n_heads,
         | 
| 24 | 
            +
                    n_layers,
         | 
| 25 | 
            +
                    kernel_size,
         | 
| 26 | 
            +
                    p_dropout,
         | 
| 27 | 
            +
                    f0=True,
         | 
| 28 | 
            +
                ):
         | 
| 29 | 
            +
                    super().__init__()
         | 
| 30 | 
            +
                    self.out_channels = out_channels
         | 
| 31 | 
            +
                    self.hidden_channels = hidden_channels
         | 
| 32 | 
            +
                    self.filter_channels = filter_channels
         | 
| 33 | 
            +
                    self.n_heads = n_heads
         | 
| 34 | 
            +
                    self.n_layers = n_layers
         | 
| 35 | 
            +
                    self.kernel_size = kernel_size
         | 
| 36 | 
            +
                    self.p_dropout = p_dropout
         | 
| 37 | 
            +
                    self.emb_phone = nn.Linear(256, hidden_channels)
         | 
| 38 | 
            +
                    self.lrelu = nn.LeakyReLU(0.1, inplace=True)
         | 
| 39 | 
            +
                    if f0 == True:
         | 
| 40 | 
            +
                        self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
         | 
| 41 | 
            +
                    self.encoder = attentions.Encoder(
         | 
| 42 | 
            +
                        hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
         | 
| 43 | 
            +
                    )
         | 
| 44 | 
            +
                    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
         | 
| 45 | 
            +
             | 
| 46 | 
            +
                def forward(self, phone, pitch, lengths):
         | 
| 47 | 
            +
                    if pitch == None:
         | 
| 48 | 
            +
                        x = self.emb_phone(phone)
         | 
| 49 | 
            +
                    else:
         | 
| 50 | 
            +
                        x = self.emb_phone(phone) + self.emb_pitch(pitch)
         | 
| 51 | 
            +
                    x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
         | 
| 52 | 
            +
                    x = self.lrelu(x)
         | 
| 53 | 
            +
                    x = torch.transpose(x, 1, -1)  # [b, h, t]
         | 
| 54 | 
            +
                    x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
         | 
| 55 | 
            +
                        x.dtype
         | 
| 56 | 
            +
                    )
         | 
| 57 | 
            +
                    x = self.encoder(x * x_mask, x_mask)
         | 
| 58 | 
            +
                    stats = self.proj(x) * x_mask
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                    m, logs = torch.split(stats, self.out_channels, dim=1)
         | 
| 61 | 
            +
                    return m, logs, x_mask
         | 
| 62 | 
            +
             | 
| 63 | 
            +
             | 
| 64 | 
            +
            class TextEncoder768(nn.Module):
         | 
| 65 | 
            +
                def __init__(
         | 
| 66 | 
            +
                    self,
         | 
| 67 | 
            +
                    out_channels,
         | 
| 68 | 
            +
                    hidden_channels,
         | 
| 69 | 
            +
                    filter_channels,
         | 
| 70 | 
            +
                    n_heads,
         | 
| 71 | 
            +
                    n_layers,
         | 
| 72 | 
            +
                    kernel_size,
         | 
| 73 | 
            +
                    p_dropout,
         | 
| 74 | 
            +
                    f0=True,
         | 
| 75 | 
            +
                ):
         | 
| 76 | 
            +
                    super().__init__()
         | 
| 77 | 
            +
                    self.out_channels = out_channels
         | 
| 78 | 
            +
                    self.hidden_channels = hidden_channels
         | 
| 79 | 
            +
                    self.filter_channels = filter_channels
         | 
| 80 | 
            +
                    self.n_heads = n_heads
         | 
| 81 | 
            +
                    self.n_layers = n_layers
         | 
| 82 | 
            +
                    self.kernel_size = kernel_size
         | 
| 83 | 
            +
                    self.p_dropout = p_dropout
         | 
| 84 | 
            +
                    self.emb_phone = nn.Linear(768, hidden_channels)
         | 
| 85 | 
            +
                    self.lrelu = nn.LeakyReLU(0.1, inplace=True)
         | 
| 86 | 
            +
                    if f0 == True:
         | 
| 87 | 
            +
                        self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
         | 
| 88 | 
            +
                    self.encoder = attentions.Encoder(
         | 
| 89 | 
            +
                        hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
         | 
| 90 | 
            +
                    )
         | 
| 91 | 
            +
                    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
         | 
| 92 | 
            +
             | 
| 93 | 
            +
                def forward(self, phone, pitch, lengths):
         | 
| 94 | 
            +
                    if pitch == None:
         | 
| 95 | 
            +
                        x = self.emb_phone(phone)
         | 
| 96 | 
            +
                    else:
         | 
| 97 | 
            +
                        x = self.emb_phone(phone) + self.emb_pitch(pitch)
         | 
| 98 | 
            +
                    x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
         | 
| 99 | 
            +
                    x = self.lrelu(x)
         | 
| 100 | 
            +
                    x = torch.transpose(x, 1, -1)  # [b, h, t]
         | 
| 101 | 
            +
                    x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
         | 
| 102 | 
            +
                        x.dtype
         | 
| 103 | 
            +
                    )
         | 
| 104 | 
            +
                    x = self.encoder(x * x_mask, x_mask)
         | 
| 105 | 
            +
                    stats = self.proj(x) * x_mask
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    m, logs = torch.split(stats, self.out_channels, dim=1)
         | 
| 108 | 
            +
                    return m, logs, x_mask
         | 
| 109 | 
            +
             | 
| 110 | 
            +
             | 
| 111 | 
            +
            class ResidualCouplingBlock(nn.Module):
                """Chain of ``n_flows`` coupling layers, each followed by a ``Flip``.

                ``self.flows`` therefore holds ``2 * n_flows`` modules, with the
                coupling layers at the even indices.
                """

                def __init__(
                    self,
                    channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    n_flows=4,
                    gin_channels=0,
                ):
                    """Store the hyper-parameters and build the flow stack."""
                    super().__init__()
                    self.channels = channels
                    self.hidden_channels = hidden_channels
                    self.kernel_size = kernel_size
                    self.dilation_rate = dilation_rate
                    self.n_layers = n_layers
                    self.n_flows = n_flows
                    self.gin_channels = gin_channels

                    self.flows = nn.ModuleList()
                    for _ in range(n_flows):
                        coupling = modules.ResidualCouplingLayer(
                            channels,
                            hidden_channels,
                            kernel_size,
                            dilation_rate,
                            n_layers,
                            gin_channels=gin_channels,
                            mean_only=True,
                        )
                        self.flows.append(coupling)
                        self.flows.append(modules.Flip())

                def forward(self, x, x_mask, g=None, reverse=False):
                    """Run ``x`` through the flows, or through them backwards.

                    In the forward direction each flow returns a pair and only its
                    first element is kept; in the reverse direction each flow
                    returns the new ``x`` directly.
                    """
                    if reverse:
                        for flow in reversed(self.flows):
                            x = flow(x, x_mask, g=g, reverse=reverse)
                        return x

                    for flow in self.flows:
                        x, _ = flow(x, x_mask, g=g, reverse=reverse)
                    return x

                def remove_weight_norm(self):
                    """Call ``remove_weight_norm`` on every coupling layer."""
                    # Coupling layers sit at the even positions; the odd ones are Flips.
                    for position in range(0, 2 * self.n_flows, 2):
                        self.flows[position].remove_weight_norm()
         | 
| 158 | 
            +
             | 
| 159 | 
            +
             | 
| 160 | 
            +
            class PosteriorEncoder(nn.Module):
                """1x1 conv -> ``modules.WN`` stack -> 1x1 projection, then sampling.

                The projection yields ``2 * out_channels`` channels that are split
                into a mean ``m`` and ``logs``; ``forward`` draws
                ``z = m + noise * exp(logs)`` under the length mask.
                """

                def __init__(
                    self,
                    in_channels,
                    out_channels,
                    hidden_channels,
                    kernel_size,
                    dilation_rate,
                    n_layers,
                    gin_channels=0,
                ):
                    """Store the hyper-parameters and create the three sub-modules."""
                    super().__init__()
                    self.in_channels = in_channels
                    self.out_channels = out_channels
                    self.hidden_channels = hidden_channels
                    self.kernel_size = kernel_size
                    self.dilation_rate = dilation_rate
                    self.n_layers = n_layers
                    self.gin_channels = gin_channels

                    self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
                    self.enc = modules.WN(
                        hidden_channels,
                        kernel_size,
                        dilation_rate,
                        n_layers,
                        gin_channels=gin_channels,
                    )
                    self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

                def forward(self, x, x_lengths, g=None):
                    """Return ``(z, m, logs, x_mask)`` for input ``x``.

                    ``x_mask`` is built from ``x_lengths`` over ``x.size(2)`` and cast
                    to ``x.dtype``; ``g`` is forwarded to the WN stack.
                    """
                    length_mask = commons.sequence_mask(x_lengths, x.size(2))
                    x_mask = torch.unsqueeze(length_mask, 1).to(x.dtype)

                    hidden = self.pre(x) * x_mask
                    hidden = self.enc(hidden, x_mask, g=g)
                    stats = self.proj(hidden) * x_mask

                    m, logs = torch.split(stats, self.out_channels, dim=1)
                    noise = torch.randn_like(m)
                    z = (m + noise * torch.exp(logs)) * x_mask
                    return z, m, logs, x_mask

                def remove_weight_norm(self):
                    """Delegate weight-norm removal to the WN stack."""
                    self.enc.remove_weight_norm()
         | 
| 203 | 
            +
             | 
| 204 | 
            +
             | 
| 205 | 
            +
            class Generator(torch.nn.Module):
                """Upsampling generator without an F0 source.

                ``conv_pre`` is followed, per upsample stage, by a leaky ReLU, a
                weight-normalised transposed convolution and the mean of
                ``num_kernels`` residual blocks; ``conv_post`` and ``tanh`` finish
                the signal.
                """

                def __init__(
                    self,
                    initial_channel,
                    resblock,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    upsample_rates,
                    upsample_initial_channel,
                    upsample_kernel_sizes,
                    gin_channels=0,
                ):
                    """Build the convolutions, upsamplers and residual blocks.

                    ``resblock == "1"`` selects ``modules.ResBlock1``; any other value
                    selects ``modules.ResBlock2``. ``self.cond`` only exists when
                    ``gin_channels`` is non-zero.
                    """
                    super().__init__()
                    self.num_kernels = len(resblock_kernel_sizes)
                    self.num_upsamples = len(upsample_rates)
                    self.conv_pre = Conv1d(
                        initial_channel, upsample_initial_channel, 7, 1, padding=3
                    )
                    if resblock == "1":
                        resblock_cls = modules.ResBlock1
                    else:
                        resblock_cls = modules.ResBlock2

                    # Each stage halves the channel count while upsampling by ``rate``.
                    self.ups = nn.ModuleList()
                    for stage, (rate, kernel) in enumerate(
                        zip(upsample_rates, upsample_kernel_sizes)
                    ):
                        transposed = ConvTranspose1d(
                            upsample_initial_channel // (2**stage),
                            upsample_initial_channel // (2 ** (stage + 1)),
                            kernel,
                            rate,
                            padding=(kernel - rate) // 2,
                        )
                        self.ups.append(weight_norm(transposed))

                    self.resblocks = nn.ModuleList()
                    for stage in range(len(self.ups)):
                        ch = upsample_initial_channel // (2 ** (stage + 1))
                        for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                            self.resblocks.append(resblock_cls(ch, k, d))

                    # ``ch`` is the channel count of the last stage at this point.
                    self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
                    self.ups.apply(init_weights)

                    if gin_channels != 0:
                        self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

                def forward(self, x, g=None):
                    """Generate the output for ``x``; ``g`` is added via ``self.cond``."""
                    x = self.conv_pre(x)
                    if g is not None:
                        x = x + self.cond(g)

                    for stage in range(self.num_upsamples):
                        x = self.ups[stage](F.leaky_relu(x, modules.LRELU_SLOPE))
                        first = stage * self.num_kernels
                        xs = None
                        for offset in range(self.num_kernels):
                            branch = self.resblocks[first + offset](x)
                            if xs is None:
                                xs = branch
                            else:
                                xs += branch
                        x = xs / self.num_kernels

                    # The final activation uses F.leaky_relu's default slope.
                    x = self.conv_post(F.leaky_relu(x))
                    return torch.tanh(x)

                def remove_weight_norm(self):
                    """Strip weight norm from the upsamplers and the residual blocks."""
                    for upsampler in self.ups:
                        remove_weight_norm(upsampler)
                    for block in self.resblocks:
                        block.remove_weight_norm()
         | 
| 279 | 
            +
             | 
| 280 | 
            +
             | 
| 281 | 
            +
            class SineGen(torch.nn.Module):
                """Sine-wave source generator driven by an F0 contour.

                SineGen(samp_rate, harmonic_num=0,
                        sine_amp=0.1, noise_std=0.003,
                        voiced_threshold=0,
                        flag_for_pulse=False)
                samp_rate: sampling rate in Hz
                harmonic_num: number of harmonic overtones (default 0)
                sine_amp: amplitude of the sine waveform (default 0.1)
                noise_std: std of Gaussian noise (default 0.003)
                voiced_threshold: F0 threshold for U/V classification (default 0)
                flag_for_pulse: accepted in the signature but not read by this
                    class (default False)
                """

                def __init__(
                    self,
                    samp_rate,
                    harmonic_num=0,
                    sine_amp=0.1,
                    noise_std=0.003,
                    voiced_threshold=0,
                    flag_for_pulse=False,
                ):
                    super().__init__()
                    self.sine_amp = sine_amp
                    self.noise_std = noise_std
                    self.harmonic_num = harmonic_num
                    # One channel for the fundamental plus one per overtone.
                    self.dim = self.harmonic_num + 1
                    self.sampling_rate = samp_rate
                    self.voiced_threshold = voiced_threshold

                def _f02uv(self, f0):
                    """Return a float mask: 1 where ``f0 > voiced_threshold``, else 0."""
                    voiced = f0 > self.voiced_threshold
                    return (torch.ones_like(f0) * voiced).float()

                def forward(self, f0, upp):
                    """sine_waves, uv, noise = forward(f0, upp)

                    f0: indexed as ``f0[:, None].transpose(1, 2)`` and then read at
                        ``[:, :, 0]`` -- assumes a (batch, length) tensor, TODO
                        confirm against the caller; f0 for unvoiced steps should be 0
                    upp: scale factor used to interpolate along the length axis
                    output sine_waves: tensor(batch, length * upp, dim)
                    output uv: tensor(batch, length * upp, 1)
                    output noise: the additive noise, same shape as sine_waves
                    """
                    with torch.no_grad():
                        f0 = f0[:, None].transpose(1, 2)
                        batch, frames = f0.shape[0], f0.shape[1]
                        f0_buf = torch.zeros(batch, frames, self.dim, device=f0.device)
                        # fundamental component
                        f0_buf[:, :, 0] = f0[:, :, 0]
                        # channel h - 1 carries the h-th harmonic, i.e. the (h-1)-th overtone
                        for harmonic in range(2, self.harmonic_num + 2):
                            f0_buf[:, :, harmonic - 1] = f0_buf[:, :, 0] * harmonic
                        # the "% 1" here means the per-harmonic product cannot be
                        # optimised away in post-processing
                        rad_values = (f0_buf / self.sampling_rate) % 1
                        rand_ini = torch.rand(batch, f0_buf.shape[2], device=f0_buf.device)
                        # the fundamental always starts with zero phase offset
                        rand_ini[:, 0] = 0
                        rad_values[:, 0, :] += rand_ini
                        # no "% 1" yet: taking it here would prevent optimising the
                        # later cumsum
                        phase = torch.cumsum(rad_values, 1)
                        phase *= upp
                        phase = F.interpolate(
                            phase.transpose(2, 1),
                            scale_factor=upp,
                            mode="linear",
                            align_corners=True,
                        ).transpose(2, 1)
                        rad_values = F.interpolate(
                            rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
                        ).transpose(2, 1)
                        phase %= 1
                        # True wherever the wrapped phase drops between neighbours
                        wrapped = (phase[:, 1:, :] - phase[:, :-1, :]) < 0
                        cumsum_shift = torch.zeros_like(rad_values)
                        cumsum_shift[:, 1:, :] = wrapped * -1.0
                        sine_waves = (
                            torch.sin(
                                torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
                            )
                            * self.sine_amp
                        )
                        uv = F.interpolate(
                            self._f02uv(f0).transpose(2, 1),
                            scale_factor=upp,
                            mode="nearest",
                        ).transpose(2, 1)
                        # voiced steps get noise_std, unvoiced steps get sine_amp / 3
                        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
                        noise = noise_amp * torch.randn_like(sine_waves)
                        sine_waves = sine_waves * uv + noise
                    return sine_waves, uv, noise
         | 
| 371 | 
            +
             | 
| 372 | 
            +
             | 
| 373 | 
            +
            class SourceModuleHnNSF(torch.nn.Module):
                """Source module for hn-nsf.

                SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                                  add_noise_std=0.003, voiced_threshod=0, is_half=True)
                sampling_rate: sampling rate in Hz
                harmonic_num: number of harmonics above F0 (default: 0)
                sine_amp: amplitude of the sine source signal (default: 0.1)
                add_noise_std: std of additive Gaussian noise (default: 0.003);
                    note that the noise amplitude in unvoiced regions is decided
                    by sine_amp
                voiced_threshod: threshold to set U/V given F0 (default: 0)
                is_half: when true, the sine waves are cast with ``.half()`` before
                    the linear merge (default: True)

                sine_merge, None, None = SourceModuleHnNSF(F0_sampled, upp)
                Only the merged sine source is returned; the two remaining slots
                (noise, uv) are always ``None``.
                """

                def __init__(
                    self,
                    sampling_rate,
                    harmonic_num=0,
                    sine_amp=0.1,
                    add_noise_std=0.003,
                    voiced_threshod=0,
                    is_half=True,
                ):
                    super().__init__()

                    self.sine_amp = sine_amp
                    self.noise_std = add_noise_std
                    self.is_half = is_half

                    # to produce sine waveforms
                    self.l_sin_gen = SineGen(
                        samp_rate=sampling_rate,
                        harmonic_num=harmonic_num,
                        sine_amp=sine_amp,
                        noise_std=add_noise_std,
                        voiced_threshold=voiced_threshod,
                    )

                    # to merge source harmonics into a single excitation
                    self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
                    self.l_tanh = torch.nn.Tanh()

                def forward(self, x, upp=None):
                    """Return ``(sine_merge, None, None)`` for the F0 input ``x``."""
                    harmonics, _uv, _noise = self.l_sin_gen(x, upp)
                    if self.is_half:
                        harmonics = harmonics.half()
                    merged = self.l_linear(harmonics)
                    return self.l_tanh(merged), None, None  # noise, uv
         | 
| 420 | 
            +
             | 
| 421 | 
            +
             | 
| 422 | 
            +
            class GeneratorNSF(torch.nn.Module):
         | 
| 423 | 
            +
                def __init__(
         | 
| 424 | 
            +
                    self,
         | 
| 425 | 
            +
                    initial_channel,
         | 
| 426 | 
            +
                    resblock,
         | 
| 427 | 
            +
                    resblock_kernel_sizes,
         | 
| 428 | 
            +
                    resblock_dilation_sizes,
         | 
| 429 | 
            +
                    upsample_rates,
         | 
| 430 | 
            +
                    upsample_initial_channel,
         | 
| 431 | 
            +
                    upsample_kernel_sizes,
         | 
| 432 | 
            +
                    gin_channels,
         | 
| 433 | 
            +
                    sr,
         | 
| 434 | 
            +
                    is_half=False,
         | 
| 435 | 
            +
                ):
         | 
| 436 | 
            +
                    super(GeneratorNSF, self).__init__()
         | 
| 437 | 
            +
                    self.num_kernels = len(resblock_kernel_sizes)
         | 
| 438 | 
            +
                    self.num_upsamples = len(upsample_rates)
         | 
| 439 | 
            +
             | 
| 440 | 
            +
                    self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
         | 
| 441 | 
            +
                    self.m_source = SourceModuleHnNSF(
         | 
| 442 | 
            +
                        sampling_rate=sr, harmonic_num=0, is_half=is_half
         | 
| 443 | 
            +
                    )
         | 
| 444 | 
            +
                    self.noise_convs = nn.ModuleList()
         | 
| 445 | 
            +
                    self.conv_pre = Conv1d(
         | 
| 446 | 
            +
                        initial_channel, upsample_initial_channel, 7, 1, padding=3
         | 
| 447 | 
            +
                    )
         | 
| 448 | 
            +
                    resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
         | 
| 449 | 
            +
             | 
| 450 | 
            +
                    self.ups = nn.ModuleList()
         | 
| 451 | 
            +
                    for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
         | 
| 452 | 
            +
                        c_cur = upsample_initial_channel // (2 ** (i + 1))
         | 
| 453 | 
            +
                        self.ups.append(
         | 
| 454 | 
            +
                            weight_norm(
         | 
| 455 | 
            +
                                ConvTranspose1d(
         | 
| 456 | 
            +
                                    upsample_initial_channel // (2**i),
         | 
| 457 | 
            +
                                    upsample_initial_channel // (2 ** (i + 1)),
         | 
| 458 | 
            +
                                    k,
         | 
| 459 | 
            +
                                    u,
         | 
| 460 | 
            +
                                    padding=(k - u) // 2,
         | 
| 461 | 
            +
                                )
         | 
| 462 | 
            +
                            )
         | 
| 463 | 
            +
                        )
         | 
| 464 | 
            +
                        if i + 1 < len(upsample_rates):
         | 
| 465 | 
            +
                            stride_f0 = np.prod(upsample_rates[i + 1 :])
         | 
| 466 | 
            +
                            self.noise_convs.append(
         | 
| 467 | 
            +
                                Conv1d(
         | 
| 468 | 
            +
                                    1,
         | 
| 469 | 
            +
                                    c_cur,
         | 
| 470 | 
            +
                                    kernel_size=stride_f0 * 2,
         | 
| 471 | 
            +
                                    stride=stride_f0,
         | 
| 472 | 
            +
                                    padding=stride_f0 // 2,
         | 
| 473 | 
            +
                                )
         | 
| 474 | 
            +
                            )
         | 
| 475 | 
            +
                        else:
         | 
| 476 | 
            +
                            self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
         | 
| 477 | 
            +
             | 
| 478 | 
            +
                    self.resblocks = nn.ModuleList()
         | 
| 479 | 
            +
                    for i in range(len(self.ups)):
         | 
| 480 | 
            +
                        ch = upsample_initial_channel // (2 ** (i + 1))
         | 
| 481 | 
            +
                        for j, (k, d) in enumerate(
         | 
| 482 | 
            +
                            zip(resblock_kernel_sizes, resblock_dilation_sizes)
         | 
| 483 | 
            +
                        ):
         | 
| 484 | 
            +
                            self.resblocks.append(resblock(ch, k, d))
         | 
| 485 | 
            +
             | 
| 486 | 
            +
                    self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
         | 
| 487 | 
            +
                    self.ups.apply(init_weights)
         | 
| 488 | 
            +
             | 
| 489 | 
            +
                    if gin_channels != 0:
         | 
| 490 | 
            +
                        self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
         | 
| 491 | 
            +
             | 
| 492 | 
            +
                    self.upp = np.prod(upsample_rates)
         | 
| 493 | 
            +
             | 
| 494 | 
            +
                def forward(self, x, f0, g=None):
                    """Run the generator on ``x`` using the source signal built from ``f0``.

                    ``self.m_source(f0, self.upp)`` yields three values; only the first
                    one is used, after swapping its axes 1 and 2.  ``x`` goes through
                    ``conv_pre`` and, when ``g`` is given, ``self.cond(g)`` is added.
                    Each upsampling stage then applies a leaky ReLU, the stage's
                    upsampler, adds the stage's ``noise_convs`` output of the source
                    signal, and replaces ``x`` by the mean of that stage's
                    ``num_kernels`` residual blocks.  The result goes through a final
                    leaky ReLU, ``conv_post`` and ``tanh`` and is returned.
                    """
                    harmonic, _noise, _uv = self.m_source(f0, self.upp)
                    harmonic = harmonic.transpose(1, 2)

                    x = self.conv_pre(x)
                    if g is not None:
                        x = x + self.cond(g)

                    for stage in range(self.num_upsamples):
                        x = F.leaky_relu(x, modules.LRELU_SLOPE)
                        x = self.ups[stage](x)
                        x = x + self.noise_convs[stage](harmonic)

                        # Residual blocks are stored flat; this stage owns the slice
                        # starting at stage * num_kernels.
                        first_block = stage * self.num_kernels
                        accumulated = None
                        for offset in range(self.num_kernels):
                            branch = self.resblocks[first_block + offset](x)
                            if accumulated is None:
                                accumulated = branch
                            else:
                                accumulated += branch
                        x = accumulated / self.num_kernels

                    x = F.leaky_relu(x)
                    x = self.conv_post(x)
                    return torch.tanh(x)
         | 
| 517 | 
            +
             | 
| 518 | 
            +
                def remove_weight_norm(self):
                    """Strip weight normalisation from the upsamplers and residual blocks.

                    Every entry of ``self.ups`` is passed to the module-level
                    ``remove_weight_norm`` helper; every entry of ``self.resblocks``
                    is asked to do it itself via its own ``remove_weight_norm`` method.
                    """
                    for upsampler in self.ups:
                        remove_weight_norm(upsampler)
                    for res_block in self.resblocks:
                        res_block.remove_weight_norm()
         | 
| 523 | 
            +
             | 
| 524 | 
            +
             | 
| 525 | 
            +
            # Maps the sample-rate labels "32k", "40k" and "48k" to their value in Hz
            # (32000, 40000, 48000).
            sr2sr = {f"{khz}k": khz * 1000 for khz in (32, 40, 48)}
         | 
| 530 | 
            +
             | 
| 531 | 
            +
             | 
| 532 | 
            +
            class SynthesizerTrnMs256NSFsid(nn.Module):
                """Synthesizer combining a ``TextEncoder256`` prior encoder, a
                ``GeneratorNSF`` decoder, a ``PosteriorEncoder``, a
                ``ResidualCouplingBlock`` flow and a speaker embedding table.

                ``sr`` may be given either as a number or as one of the string keys
                of the module-level ``sr2sr`` table, in which case it is looked up
                there before being passed to the decoder.  ``kwargs`` must contain
                ``is_half``, which is forwarded to the decoder; a missing key raises
                ``KeyError``.
                """

                def __init__(
                    self,
                    spec_channels,
                    segment_size,
                    inter_channels,
                    hidden_channels,
                    filter_channels,
                    n_heads,
                    n_layers,
                    kernel_size,
                    p_dropout,
                    resblock,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    upsample_rates,
                    upsample_initial_channel,
                    upsample_kernel_sizes,
                    spk_embed_dim,
                    gin_channels,
                    sr,
                    **kwargs
                ):
                    super().__init__()
                    # isinstance (instead of an exact type comparison) also accepts
                    # str subclasses as sample-rate labels.
                    if isinstance(sr, str):
                        sr = sr2sr[sr]

                    # Keep the construction hyper-parameters on the instance.
                    self.spec_channels = spec_channels
                    self.inter_channels = inter_channels
                    self.hidden_channels = hidden_channels
                    self.filter_channels = filter_channels
                    self.n_heads = n_heads
                    self.n_layers = n_layers
                    self.kernel_size = kernel_size
                    self.p_dropout = p_dropout
                    self.resblock = resblock
                    self.resblock_kernel_sizes = resblock_kernel_sizes
                    self.resblock_dilation_sizes = resblock_dilation_sizes
                    self.upsample_rates = upsample_rates
                    self.upsample_initial_channel = upsample_initial_channel
                    self.upsample_kernel_sizes = upsample_kernel_sizes
                    self.segment_size = segment_size
                    self.gin_channels = gin_channels
                    self.spk_embed_dim = spk_embed_dim

                    # Sub-modules are created in the order enc_p, dec, enc_q, flow,
                    # emb_g; this is also their registration order on the module.
                    self.enc_p = TextEncoder256(
                        inter_channels,
                        hidden_channels,
                        filter_channels,
                        n_heads,
                        n_layers,
                        kernel_size,
                        p_dropout,
                    )
                    self.dec = GeneratorNSF(
                        inter_channels,
                        resblock,
                        resblock_kernel_sizes,
                        resblock_dilation_sizes,
                        upsample_rates,
                        upsample_initial_channel,
                        upsample_kernel_sizes,
                        gin_channels=gin_channels,
                        sr=sr,
                        is_half=kwargs["is_half"],
                    )
                    self.enc_q = PosteriorEncoder(
                        spec_channels,
                        inter_channels,
                        hidden_channels,
                        5,
                        1,
                        16,
                        gin_channels=gin_channels,
                    )
                    self.flow = ResidualCouplingBlock(
                        inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
                    )
                    self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
                    print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

                def remove_weight_norm(self):
                    """Ask the decoder, the flow and the posterior encoder to remove
                    their weight normalisation."""
                    self.dec.remove_weight_norm()
                    self.flow.remove_weight_norm()
                    self.enc_q.remove_weight_norm()

                def forward(
                    self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
                ):
                    """Full pass through both encoders, the flow and the decoder.

                    ``ds`` holds the speaker id(s) (original note: ``[bs, 1]``).  The
                    decoder only sees a slice of ``z`` chosen by
                    ``commons.rand_slice_segments``; ``pitchf`` is cut with the same
                    slice indices via ``commons.slice_segments2``.

                    Returns ``(o, ids_slice, x_mask, y_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q))``.
                    """
                    # Speaker embedding with a trailing singleton axis.  Original
                    # note: [b, 256, 1], the last axis being time (broadcast)
                    # -- TODO confirm shapes against the caller.
                    g = self.emb_g(ds).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
                    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
                    z_p = self.flow(z, y_mask, g=g)
                    z_slice, ids_slice = commons.rand_slice_segments(
                        z, y_lengths, self.segment_size
                    )
                    pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
                    o = self.dec(z_slice, pitchf, g=g)
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

                def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
                    """Generate output from ``phone``/``pitch`` for speaker ``sid``.

                    A latent is drawn around ``m_p`` with standard-normal noise scaled
                    by ``exp(logs_p)`` and 0.66666, masked with ``x_mask``, passed
                    through the flow with ``reverse=True`` and decoded together with
                    ``nsff0``.  Only the first ``max_len`` positions of the last axis
                    are decoded (all of them when ``max_len`` is ``None``).

                    Returns ``(o, x_mask, (z, z_p, m_p, logs_p))``.
                    """
                    g = self.emb_g(sid).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
                    o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 641 | 
            +
             | 
| 642 | 
            +
             | 
| 643 | 
            +
            class SynthesizerTrnMs768NSFsid(nn.Module):
                """Synthesizer combining a ``TextEncoder768`` prior encoder, a
                ``GeneratorNSF`` decoder, a ``PosteriorEncoder``, a
                ``ResidualCouplingBlock`` flow and a speaker embedding table.

                ``sr`` may be given either as a number or as one of the string keys
                of the module-level ``sr2sr`` table, in which case it is looked up
                there before being passed to the decoder.  ``kwargs`` must contain
                ``is_half``, which is forwarded to the decoder; a missing key raises
                ``KeyError``.
                """

                def __init__(
                    self,
                    spec_channels,
                    segment_size,
                    inter_channels,
                    hidden_channels,
                    filter_channels,
                    n_heads,
                    n_layers,
                    kernel_size,
                    p_dropout,
                    resblock,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    upsample_rates,
                    upsample_initial_channel,
                    upsample_kernel_sizes,
                    spk_embed_dim,
                    gin_channels,
                    sr,
                    **kwargs
                ):
                    super().__init__()
                    # isinstance (instead of an exact type comparison) also accepts
                    # str subclasses as sample-rate labels.
                    if isinstance(sr, str):
                        sr = sr2sr[sr]

                    # Keep the construction hyper-parameters on the instance.
                    self.spec_channels = spec_channels
                    self.inter_channels = inter_channels
                    self.hidden_channels = hidden_channels
                    self.filter_channels = filter_channels
                    self.n_heads = n_heads
                    self.n_layers = n_layers
                    self.kernel_size = kernel_size
                    self.p_dropout = p_dropout
                    self.resblock = resblock
                    self.resblock_kernel_sizes = resblock_kernel_sizes
                    self.resblock_dilation_sizes = resblock_dilation_sizes
                    self.upsample_rates = upsample_rates
                    self.upsample_initial_channel = upsample_initial_channel
                    self.upsample_kernel_sizes = upsample_kernel_sizes
                    self.segment_size = segment_size
                    self.gin_channels = gin_channels
                    self.spk_embed_dim = spk_embed_dim

                    # Sub-modules are created in the order enc_p, dec, enc_q, flow,
                    # emb_g; this is also their registration order on the module.
                    self.enc_p = TextEncoder768(
                        inter_channels,
                        hidden_channels,
                        filter_channels,
                        n_heads,
                        n_layers,
                        kernel_size,
                        p_dropout,
                    )
                    self.dec = GeneratorNSF(
                        inter_channels,
                        resblock,
                        resblock_kernel_sizes,
                        resblock_dilation_sizes,
                        upsample_rates,
                        upsample_initial_channel,
                        upsample_kernel_sizes,
                        gin_channels=gin_channels,
                        sr=sr,
                        is_half=kwargs["is_half"],
                    )
                    self.enc_q = PosteriorEncoder(
                        spec_channels,
                        inter_channels,
                        hidden_channels,
                        5,
                        1,
                        16,
                        gin_channels=gin_channels,
                    )
                    self.flow = ResidualCouplingBlock(
                        inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
                    )
                    self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
                    print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

                def remove_weight_norm(self):
                    """Ask the decoder, the flow and the posterior encoder to remove
                    their weight normalisation."""
                    self.dec.remove_weight_norm()
                    self.flow.remove_weight_norm()
                    self.enc_q.remove_weight_norm()

                def forward(
                    self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
                ):
                    """Full pass through both encoders, the flow and the decoder.

                    ``ds`` holds the speaker id(s) (original note: ``[bs, 1]``).  The
                    decoder only sees a slice of ``z`` chosen by
                    ``commons.rand_slice_segments``; ``pitchf`` is cut with the same
                    slice indices via ``commons.slice_segments2``.

                    Returns ``(o, ids_slice, x_mask, y_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q))``.
                    """
                    # Speaker embedding with a trailing singleton axis.  Original
                    # note: [b, 256, 1], the last axis being time (broadcast)
                    # -- TODO confirm shapes against the caller.
                    g = self.emb_g(ds).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
                    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
                    z_p = self.flow(z, y_mask, g=g)
                    z_slice, ids_slice = commons.rand_slice_segments(
                        z, y_lengths, self.segment_size
                    )
                    pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
                    o = self.dec(z_slice, pitchf, g=g)
                    return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

                def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
                    """Generate output from ``phone``/``pitch`` for speaker ``sid``.

                    A latent is drawn around ``m_p`` with standard-normal noise scaled
                    by ``exp(logs_p)`` and 0.66666, masked with ``x_mask``, passed
                    through the flow with ``reverse=True`` and decoded together with
                    ``nsff0``.  Only the first ``max_len`` positions of the last axis
                    are decoded (all of them when ``max_len`` is ``None``).

                    Returns ``(o, x_mask, (z, z_p, m_p, logs_p))``.
                    """
                    g = self.emb_g(sid).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
                    o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 752 | 
            +
             | 
| 753 | 
            +
             | 
| 754 | 
            +
            class SynthesizerTrnMs256NSFsid_nono(nn.Module):
                """Pitch-free synthesizer: a ``TextEncoder256`` built with
                ``f0=False``, a plain ``Generator`` decoder, a ``PosteriorEncoder``,
                a ``ResidualCouplingBlock`` flow and a speaker embedding table.

                ``sr`` and any extra keyword arguments are accepted but not used by
                this class.
                """

                def __init__(
                    self,
                    spec_channels,
                    segment_size,
                    inter_channels,
                    hidden_channels,
                    filter_channels,
                    n_heads,
                    n_layers,
                    kernel_size,
                    p_dropout,
                    resblock,
                    resblock_kernel_sizes,
                    resblock_dilation_sizes,
                    upsample_rates,
                    upsample_initial_channel,
                    upsample_kernel_sizes,
                    spk_embed_dim,
                    gin_channels,
                    sr=None,
                    **kwargs
                ):
                    super().__init__()

                    # Record the hyper-parameters on the instance.
                    self.spec_channels = spec_channels
                    self.segment_size = segment_size
                    self.inter_channels = inter_channels
                    self.hidden_channels = hidden_channels
                    self.filter_channels = filter_channels
                    self.n_heads = n_heads
                    self.n_layers = n_layers
                    self.kernel_size = kernel_size
                    self.p_dropout = p_dropout
                    self.resblock = resblock
                    self.resblock_kernel_sizes = resblock_kernel_sizes
                    self.resblock_dilation_sizes = resblock_dilation_sizes
                    self.upsample_rates = upsample_rates
                    self.upsample_initial_channel = upsample_initial_channel
                    self.upsample_kernel_sizes = upsample_kernel_sizes
                    self.gin_channels = gin_channels
                    self.spk_embed_dim = spk_embed_dim

                    # Sub-modules, created in the order enc_p, dec, enc_q, flow, emb_g.
                    encoder_args = (
                        inter_channels,
                        hidden_channels,
                        filter_channels,
                        n_heads,
                        n_layers,
                        kernel_size,
                        p_dropout,
                    )
                    self.enc_p = TextEncoder256(*encoder_args, f0=False)

                    decoder_args = (
                        inter_channels,
                        resblock,
                        resblock_kernel_sizes,
                        resblock_dilation_sizes,
                        upsample_rates,
                        upsample_initial_channel,
                        upsample_kernel_sizes,
                    )
                    self.dec = Generator(*decoder_args, gin_channels=gin_channels)

                    self.enc_q = PosteriorEncoder(
                        spec_channels,
                        inter_channels,
                        hidden_channels,
                        5,
                        1,
                        16,
                        gin_channels=gin_channels,
                    )
                    self.flow = ResidualCouplingBlock(
                        inter_channels,
                        hidden_channels,
                        5,
                        1,
                        3,
                        gin_channels=gin_channels,
                    )
                    self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)
                    print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

                def remove_weight_norm(self):
                    """Remove weight normalisation from the decoder, the flow and the
                    posterior encoder, in that order."""
                    for part in (self.dec, self.flow, self.enc_q):
                        part.remove_weight_norm()

                def forward(self, phone, phone_lengths, y, y_lengths, ds):
                    """Full pass without pitch input.

                    ``ds`` holds the speaker id(s) (original note: ``[bs, 1]``).  The
                    prior encoder is called with ``None`` in place of pitch, and the
                    decoder receives a slice of ``z`` chosen by
                    ``commons.rand_slice_segments``.

                    Returns ``(o, ids_slice, x_mask, y_mask,
                    (z, z_p, m_p, logs_p, m_q, logs_q))``.
                    """
                    # Original note on g: [b, 256, 1], last axis is time and is
                    # broadcast -- TODO confirm shapes against the caller.
                    g = self.emb_g(ds).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
                    z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
                    z_p = self.flow(z, y_mask, g=g)
                    z_slice, ids_slice = commons.rand_slice_segments(
                        z, y_lengths, self.segment_size
                    )
                    o = self.dec(z_slice, g=g)
                    latents = (z, z_p, m_p, logs_p, m_q, logs_q)
                    return o, ids_slice, x_mask, y_mask, latents

                def infer(self, phone, phone_lengths, sid, max_len=None):
                    """Generate output from ``phone`` for speaker ``sid``.

                    A latent is drawn around ``m_p`` with standard-normal noise scaled
                    by ``exp(logs_p)`` and 0.66666, masked, run through the flow with
                    ``reverse=True`` and decoded; only the first ``max_len`` positions
                    of the last axis are decoded (all when ``max_len`` is ``None``).

                    Returns ``(o, x_mask, (z, z_p, m_p, logs_p))``.
                    """
                    g = self.emb_g(sid).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
                    z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
                    z = self.flow(z_p, x_mask, g=g, reverse=True)
                    decoder_input = (z * x_mask)[:, :, :max_len]
                    o = self.dec(decoder_input, g=g)
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 854 | 
            +
             | 
| 855 | 
            +
             | 
| 856 | 
            +
            class SynthesizerTrnMs768NSFsid_nono(nn.Module):
         | 
| 857 | 
            +
                def __init__(
         | 
| 858 | 
            +
                    self,
         | 
| 859 | 
            +
                    spec_channels,
         | 
| 860 | 
            +
                    segment_size,
         | 
| 861 | 
            +
                    inter_channels,
         | 
| 862 | 
            +
                    hidden_channels,
         | 
| 863 | 
            +
                    filter_channels,
         | 
| 864 | 
            +
                    n_heads,
         | 
| 865 | 
            +
                    n_layers,
         | 
| 866 | 
            +
                    kernel_size,
         | 
| 867 | 
            +
                    p_dropout,
         | 
| 868 | 
            +
                    resblock,
         | 
| 869 | 
            +
                    resblock_kernel_sizes,
         | 
| 870 | 
            +
                    resblock_dilation_sizes,
         | 
| 871 | 
            +
                    upsample_rates,
         | 
| 872 | 
            +
                    upsample_initial_channel,
         | 
| 873 | 
            +
                    upsample_kernel_sizes,
         | 
| 874 | 
            +
                    spk_embed_dim,
         | 
| 875 | 
            +
                    gin_channels,
         | 
| 876 | 
            +
                    sr=None,
         | 
| 877 | 
            +
                    **kwargs
         | 
| 878 | 
            +
                ):
         | 
| 879 | 
            +
                    super().__init__()
         | 
| 880 | 
            +
                    self.spec_channels = spec_channels
         | 
| 881 | 
            +
                    self.inter_channels = inter_channels
         | 
| 882 | 
            +
                    self.hidden_channels = hidden_channels
         | 
| 883 | 
            +
                    self.filter_channels = filter_channels
         | 
| 884 | 
            +
                    self.n_heads = n_heads
         | 
| 885 | 
            +
                    self.n_layers = n_layers
         | 
| 886 | 
            +
                    self.kernel_size = kernel_size
         | 
| 887 | 
            +
                    self.p_dropout = p_dropout
         | 
| 888 | 
            +
                    self.resblock = resblock
         | 
| 889 | 
            +
                    self.resblock_kernel_sizes = resblock_kernel_sizes
         | 
| 890 | 
            +
                    self.resblock_dilation_sizes = resblock_dilation_sizes
         | 
| 891 | 
            +
                    self.upsample_rates = upsample_rates
         | 
| 892 | 
            +
                    self.upsample_initial_channel = upsample_initial_channel
         | 
| 893 | 
            +
                    self.upsample_kernel_sizes = upsample_kernel_sizes
         | 
| 894 | 
            +
                    self.segment_size = segment_size
         | 
| 895 | 
            +
                    self.gin_channels = gin_channels
         | 
| 896 | 
            +
                    # self.hop_length = hop_length#
         | 
| 897 | 
            +
                    self.spk_embed_dim = spk_embed_dim
         | 
| 898 | 
            +
                    self.enc_p = TextEncoder768(
         | 
| 899 | 
            +
                        inter_channels,
         | 
| 900 | 
            +
                        hidden_channels,
         | 
| 901 | 
            +
                        filter_channels,
         | 
| 902 | 
            +
                        n_heads,
         | 
| 903 | 
            +
                        n_layers,
         | 
| 904 | 
            +
                        kernel_size,
         | 
| 905 | 
            +
                        p_dropout,
         | 
| 906 | 
            +
                        f0=False,
         | 
| 907 | 
            +
                    )
         | 
| 908 | 
            +
                    self.dec = Generator(
         | 
| 909 | 
            +
                        inter_channels,
         | 
| 910 | 
            +
                        resblock,
         | 
| 911 | 
            +
                        resblock_kernel_sizes,
         | 
| 912 | 
            +
                        resblock_dilation_sizes,
         | 
| 913 | 
            +
                        upsample_rates,
         | 
| 914 | 
            +
                        upsample_initial_channel,
         | 
| 915 | 
            +
                        upsample_kernel_sizes,
         | 
| 916 | 
            +
                        gin_channels=gin_channels,
         | 
| 917 | 
            +
                    )
         | 
| 918 | 
            +
                    self.enc_q = PosteriorEncoder(
         | 
| 919 | 
            +
                        spec_channels,
         | 
| 920 | 
            +
                        inter_channels,
         | 
| 921 | 
            +
                        hidden_channels,
         | 
| 922 | 
            +
                        5,
         | 
| 923 | 
            +
                        1,
         | 
| 924 | 
            +
                        16,
         | 
| 925 | 
            +
                        gin_channels=gin_channels,
         | 
| 926 | 
            +
                    )
         | 
| 927 | 
            +
                    self.flow = ResidualCouplingBlock(
         | 
| 928 | 
            +
                        inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
         | 
| 929 | 
            +
                    )
         | 
| 930 | 
            +
                    self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
         | 
| 931 | 
            +
                    print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
         | 
| 932 | 
            +
             | 
| 933 | 
            +
                def remove_weight_norm(self):
                    """Call ``remove_weight_norm()`` on the decoder, flow and posterior encoder.

                    The sub-modules are visited in the order ``dec``, ``flow``, ``enc_q``;
                    each attribute is looked up only when its turn comes.
                    """
                    for attr_name in ("dec", "flow", "enc_q"):
                        getattr(self, attr_name).remove_weight_norm()
         | 
| 937 | 
            +
             | 
| 938 | 
            +
                def forward(self, phone, phone_lengths, y, y_lengths, ds):
                    """Run both encoders, the flow and the decoder on one batch.

                    Args:
                        phone: input handed to ``self.enc_p`` (no pitch is passed; the
                            second argument is ``None``).
                        phone_lengths: lengths forwarded to ``self.enc_p``.
                        y: input handed to ``self.enc_q`` (presumably a spectrogram,
                            since ``enc_q`` is built with ``spec_channels`` -- confirm).
                        y_lengths: lengths for ``y``; also used to pick the random slice.
                        ds: speaker ids, ``[bs, 1]`` according to the original note.

                    Returns:
                        ``(o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q))``
                        where ``o`` is the decoder output for a random segment of ``z``.
                    """
                    # [b, 256, 1] per the original note; the trailing 1 is the time axis
                    # and is broadcast.
                    speaker_cond = self.emb_g(ds).unsqueeze(-1)

                    prior_mean, prior_logs, x_mask = self.enc_p(phone, None, phone_lengths)
                    z, post_mean, post_logs, y_mask = self.enc_q(y, y_lengths, g=speaker_cond)
                    z_p = self.flow(z, y_mask, g=speaker_cond)

                    # Only a random window of ``segment_size`` frames is decoded.
                    z_slice, ids_slice = commons.rand_slice_segments(
                        z, y_lengths, self.segment_size
                    )
                    o = self.dec(z_slice, g=speaker_cond)

                    latents = (z, z_p, prior_mean, prior_logs, post_mean, post_logs)
                    return o, ids_slice, x_mask, y_mask, latents
         | 
| 948 | 
            +
             | 
| 949 | 
            +
                def infer(self, phone, phone_lengths, sid, max_len=None):
                    """Sample a latent from the prior encoder output and decode it.

                    Args:
                        phone: input handed to ``self.enc_p`` (pitch argument is ``None``).
                        phone_lengths: lengths forwarded to ``self.enc_p``.
                        sid: speaker ids looked up in ``self.emb_g``.
                        max_len: optional cap on the last axis of the latent that is
                            decoded; ``None`` keeps the full length.

                    Returns:
                        ``(o, x_mask, (z, z_p, m_p, logs_p))``.
                    """
                    speaker_cond = self.emb_g(sid).unsqueeze(-1)
                    m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)

                    # Gaussian sample around m_p; the noise term is scaled by 0.66666.
                    std = torch.exp(logs_p)
                    perturbation = std * torch.randn_like(m_p) * 0.66666
                    z_p = (m_p + perturbation) * x_mask

                    # Run the flow backwards to map the prior sample to the decoder latent.
                    z = self.flow(z_p, x_mask, g=speaker_cond, reverse=True)
                    masked_latent = z * x_mask
                    o = self.dec(masked_latent[:, :, :max_len], g=speaker_cond)
                    return o, x_mask, (z, z_p, m_p, logs_p)
         | 
| 956 | 
            +
             | 
| 957 | 
            +
             | 
| 958 | 
            +
            class MultiPeriodDiscriminator(torch.nn.Module):
                """Bundle of one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period.

                Periods used: 2, 3, 5, 7, 11, 17.
                """

                def __init__(self, use_spectral_norm=False):
                    super().__init__()
                    periods = [2, 3, 5, 7, 11, 17]

                    # DiscriminatorS comes first, then the period discriminators in order.
                    self.discriminators = nn.ModuleList(
                        [
                            DiscriminatorS(use_spectral_norm=use_spectral_norm),
                            *[
                                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
                                for period in periods
                            ],
                        ]
                    )

                def forward(self, y, y_hat):
                    """Score ``y`` and ``y_hat`` with every sub-discriminator.

                    Returns:
                        Four lists, one entry per sub-discriminator: outputs for ``y``,
                        outputs for ``y_hat``, feature maps for ``y``, feature maps
                        for ``y_hat``.
                    """
                    real_scores, fake_scores = [], []
                    real_fmaps, fake_fmaps = [], []
                    for disc in self.discriminators:
                        # ``y`` is evaluated before ``y_hat`` for each discriminator.
                        score_r, feats_r = disc(y)
                        score_g, feats_g = disc(y_hat)
                        real_scores.append(score_r)
                        fake_scores.append(score_g)
                        real_fmaps.append(feats_r)
                        fake_fmaps.append(feats_g)

                    return real_scores, fake_scores, real_fmaps, fake_fmaps
         | 
| 986 | 
            +
             | 
| 987 | 
            +
             | 
| 988 | 
            +
            class MultiPeriodDiscriminatorV2(torch.nn.Module):
                """Bundle of one ``DiscriminatorS`` plus one ``DiscriminatorP`` per period.

                Periods used: 2, 3, 5, 7, 11, 17, 23, 37.
                """

                def __init__(self, use_spectral_norm=False):
                    super().__init__()
                    periods = [2, 3, 5, 7, 11, 17, 23, 37]

                    # DiscriminatorS comes first, then the period discriminators in order.
                    self.discriminators = nn.ModuleList(
                        [
                            DiscriminatorS(use_spectral_norm=use_spectral_norm),
                            *[
                                DiscriminatorP(period, use_spectral_norm=use_spectral_norm)
                                for period in periods
                            ],
                        ]
                    )

                def forward(self, y, y_hat):
                    """Score ``y`` and ``y_hat`` with every sub-discriminator.

                    Returns:
                        Four lists, one entry per sub-discriminator: outputs for ``y``,
                        outputs for ``y_hat``, feature maps for ``y``, feature maps
                        for ``y_hat``.
                    """
                    real_scores, fake_scores = [], []
                    real_fmaps, fake_fmaps = [], []
                    for disc in self.discriminators:
                        # ``y`` is evaluated before ``y_hat`` for each discriminator.
                        score_r, feats_r = disc(y)
                        score_g, feats_g = disc(y_hat)
                        real_scores.append(score_r)
                        fake_scores.append(score_g)
                        real_fmaps.append(feats_r)
                        fake_fmaps.append(feats_g)

                    return real_scores, fake_scores, real_fmaps, fake_fmaps
         | 
| 1016 | 
            +
             | 
| 1017 | 
            +
             | 
| 1018 | 
            +
            class DiscriminatorS(torch.nn.Module):
                """Discriminator made of a stack of (grouped) 1-D convolutions.

                The first layer takes a single input channel; ``conv_post`` reduces the
                final 1024 channels to one.
                """

                def __init__(self, use_spectral_norm=False):
                    super().__init__()
                    # ``== False`` is kept verbatim, so a value such as ``None`` still
                    # selects spectral_norm exactly as before.
                    norm_f = weight_norm if use_spectral_norm == False else spectral_norm

                    # (in_channels, out_channels, kernel_size, stride, groups, padding)
                    layer_specs = (
                        (1, 16, 15, 1, 1, 7),
                        (16, 64, 41, 4, 4, 20),
                        (64, 256, 41, 4, 16, 20),
                        (256, 1024, 41, 4, 64, 20),
                        (1024, 1024, 41, 4, 256, 20),
                        (1024, 1024, 5, 1, 1, 2),
                    )
                    self.convs = nn.ModuleList(
                        [
                            norm_f(Conv1d(c_in, c_out, kernel, step, groups=grp, padding=pad))
                            for c_in, c_out, kernel, step, grp, pad in layer_specs
                        ]
                    )
                    self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

                def forward(self, x):
                    """Return ``(flattened_output, fmap)``.

                    ``fmap`` collects the activation after every layer in ``self.convs``
                    (post leaky-ReLU) followed by the raw ``conv_post`` output.
                    """
                    fmap = []

                    for layer in self.convs:
                        x = F.leaky_relu(layer(x), modules.LRELU_SLOPE)
                        fmap.append(x)

                    x = self.conv_post(x)
                    fmap.append(x)

                    # Collapse everything after the batch axis.
                    return torch.flatten(x, 1, -1), fmap
         | 
| 1046 | 
            +
             | 
| 1047 | 
            +
             | 
| 1048 | 
            +
            class DiscriminatorP(torch.nn.Module):
                """Discriminator that folds a 1-D signal into ``period`` columns and
                applies 2-D convolutions along the folded time axis."""

                def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
                    super().__init__()
                    self.period = period
                    self.use_spectral_norm = use_spectral_norm
                    # ``== False`` is kept verbatim, so a value such as ``None`` still
                    # selects spectral_norm exactly as before.
                    norm_f = weight_norm if use_spectral_norm == False else spectral_norm

                    def make_layer(c_in, c_out, step):
                        # Kernel and padding act only on the first spatial axis; the
                        # period axis uses size 1 / padding 0.
                        return norm_f(
                            Conv2d(
                                c_in,
                                c_out,
                                (kernel_size, 1),
                                step,
                                padding=(get_padding(kernel_size, 1), 0),
                            )
                        )

                    # Four strided layers that widen the channels, then one stride-1 layer.
                    widths = (1, 32, 128, 512, 1024)
                    layers = [
                        make_layer(c_in, c_out, (stride, 1))
                        for c_in, c_out in zip(widths[:-1], widths[1:])
                    ]
                    layers.append(make_layer(1024, 1024, 1))
                    self.convs = nn.ModuleList(layers)
                    self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

                def forward(self, x):
                    """Return ``(flattened_output, fmap)`` for a ``(b, c, t)`` input.

                    ``fmap`` collects the activation after every layer in ``self.convs``
                    (post leaky-ReLU) followed by the raw ``conv_post`` output.
                    """
                    fmap = []

                    # 1d to 2d: right-pad (reflect) so t is a multiple of the period,
                    # then fold into (b, c, t // period, period).
                    batch, channels, length = x.shape
                    leftover = length % self.period
                    if leftover != 0:
                        extra = self.period - leftover
                        x = F.pad(x, (0, extra), "reflect")
                        length = length + extra
                    x = x.view(batch, channels, length // self.period, self.period)

                    for layer in self.convs:
                        x = F.leaky_relu(layer(x), modules.LRELU_SLOPE)
                        fmap.append(x)

                    x = self.conv_post(x)
                    fmap.append(x)

                    # Collapse everything after the batch axis.
                    return torch.flatten(x, 1, -1), fmap
         | 
    	
        lib/infer_pack/onnx_inference.py
    CHANGED
    
    | @@ -39,7 +39,9 @@ def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs): | |
| 39 | 
             
                        hop_length=hop_length, sampling_rate=sampling_rate
         | 
| 40 | 
             
                    )
         | 
| 41 | 
             
                elif f0_predictor == "harvest":
         | 
| 42 | 
            -
                    from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import  | 
|  | |
|  | |
| 43 |  | 
| 44 | 
             
                    f0_predictor_object = HarvestF0Predictor(
         | 
| 45 | 
             
                        hop_length=hop_length, sampling_rate=sampling_rate
         | 
|  | |
| 39 | 
             
                        hop_length=hop_length, sampling_rate=sampling_rate
         | 
| 40 | 
             
                    )
         | 
| 41 | 
             
                elif f0_predictor == "harvest":
         | 
| 42 | 
            +
                    from lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
         | 
| 43 | 
            +
                        HarvestF0Predictor,
         | 
| 44 | 
            +
                    )
         | 
| 45 |  | 
| 46 | 
             
                    f0_predictor_object = HarvestF0Predictor(
         | 
| 47 | 
             
                        hop_length=hop_length, sampling_rate=sampling_rate
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -8,7 +8,7 @@ librosa==0.9.1 | |
| 8 | 
             
            fairseq==0.12.2
         | 
| 9 | 
             
            faiss-cpu==1.7.3
         | 
| 10 | 
             
            gradio==3.36.1
         | 
| 11 | 
            -
            pyworld | 
| 12 | 
             
            soundfile>=0.12.1
         | 
| 13 | 
             
            praat-parselmouth>=0.4.2
         | 
| 14 | 
             
            httpx==0.23.0
         | 
|  | |
| 8 | 
             
            fairseq==0.12.2
         | 
| 9 | 
             
            faiss-cpu==1.7.3
         | 
| 10 | 
             
            gradio==3.36.1
         | 
| 11 | 
            +
            pyworld==0.3.2
         | 
| 12 | 
             
            soundfile>=0.12.1
         | 
| 13 | 
             
            praat-parselmouth>=0.4.2
         | 
| 14 | 
             
            httpx==0.23.0
         | 
    	
        vc_infer_pipeline.py
    CHANGED
    
    | @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            import numpy as np, parselmouth, torch, pdb
         | 
| 2 | 
             
            from time import time as ttime
         | 
| 3 | 
             
            import torch.nn.functional as F
         | 
| 4 | 
             
            import scipy.signal as signal
         | 
| @@ -6,6 +6,9 @@ import pyworld, os, traceback, faiss, librosa, torchcrepe | |
| 6 | 
             
            from scipy import signal
         | 
| 7 | 
             
            from functools import lru_cache
         | 
| 8 |  | 
|  | |
|  | |
|  | |
| 9 | 
             
            bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
         | 
| 10 |  | 
| 11 | 
             
            input_audio_path2wav = {}
         | 
| @@ -124,6 +127,15 @@ class VC(object): | |
| 124 | 
             
                        f0 = torchcrepe.filter.mean(f0, 3)
         | 
| 125 | 
             
                        f0[pd < 0.1] = 0
         | 
| 126 | 
             
                        f0 = f0[0].cpu().numpy()
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 127 | 
             
                    f0 *= pow(2, f0_up_key / 12)
         | 
| 128 | 
             
                    # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         | 
| 129 | 
             
                    tf0 = self.sr // self.window  # 每秒f0点数
         | 
|  | |
| 1 | 
            +
            import numpy as np, parselmouth, torch, pdb, sys, os
         | 
| 2 | 
             
            from time import time as ttime
         | 
| 3 | 
             
            import torch.nn.functional as F
         | 
| 4 | 
             
            import scipy.signal as signal
         | 
|  | |
| 6 | 
             
            from scipy import signal
         | 
| 7 | 
             
            from functools import lru_cache
         | 
| 8 |  | 
| 9 | 
            +
            now_dir = os.getcwd()
         | 
| 10 | 
            +
            sys.path.append(now_dir)
         | 
| 11 | 
            +
             | 
| 12 | 
             
            bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
         | 
| 13 |  | 
| 14 | 
             
            input_audio_path2wav = {}
         | 
|  | |
| 127 | 
             
                        f0 = torchcrepe.filter.mean(f0, 3)
         | 
| 128 | 
             
                        f0[pd < 0.1] = 0
         | 
| 129 | 
             
                        f0 = f0[0].cpu().numpy()
         | 
| 130 | 
            +
                    elif f0_method == "rmvpe":
         | 
| 131 | 
            +
                        if hasattr(self, "model_rmvpe") == False:
         | 
| 132 | 
            +
                            from rmvpe import RMVPE
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                            print("loading rmvpe model")
         | 
| 135 | 
            +
                            self.model_rmvpe = RMVPE(
         | 
| 136 | 
            +
                                "rmvpe.pt", is_half=self.is_half, device=self.device
         | 
| 137 | 
            +
                            )
         | 
| 138 | 
            +
                        f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03)
         | 
| 139 | 
             
                    f0 *= pow(2, f0_up_key / 12)
         | 
| 140 | 
             
                    # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()]))
         | 
| 141 | 
             
                    tf0 = self.sr // self.window  # 每秒f0点数
         | 
