import logging
from typing import Any

# Raise numba's logger threshold before the third-party imports below run.
logging.getLogger("numba").setLevel(logging.WARNING)

import torch
import torch.nn as nn
from transformers import (
    AutoConfig,
    HubertConfig,
    HubertModel,
    Wav2Vec2FeatureExtractor,
)
class CNHubert(nn.Module):
    """HuBERT encoder bundled with its waveform feature extractor.

    Both components are built from in-memory configuration dictionaries.
    ``HubertModel`` is constructed from its config alone; this class does not
    load pretrained weights (presumably the caller loads a state dict
    afterwards -- confirm against call sites).
    """

    def __init__(
        self,
        hubert_config_dict: dict[str, Any],
        extractor_config_dict: dict[str, Any],
    ):
        """Build the model and the feature extractor from config dictionaries.

        Args:
            hubert_config_dict: Passed to ``HubertConfig.from_dict``.
            extractor_config_dict: Passed to
                ``Wav2Vec2FeatureExtractor.from_dict``.
        """
        super().__init__()
        self.model = HubertModel(HubertConfig.from_dict(hubert_config_dict))
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_dict(
            extractor_config_dict
        )

    def forward(self, x):
        """Return the model's ``last_hidden_state`` for ``x``.

        Args:
            x: Audio input handed to the feature extractor with
                ``sampling_rate=16000``; must expose ``.device``
                (presumably a 16 kHz waveform tensor -- confirm with callers).

        Returns:
            The ``"last_hidden_state"`` entry of the model output.
        """
        # The extractor output is moved onto x's device before the model runs.
        input_values = self.feature_extractor(
            x, return_tensors="pt", sampling_rate=16000
        ).input_values.to(x.device)
        return self.model(input_values)["last_hidden_state"]
| # class CNHubertLarge(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| # | |
| # class CVec(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| # | |
| # class cnw2v2base(nn.Module): | |
| # def __init__(self): | |
| # super().__init__() | |
| # self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") | |
| # self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base") | |
| # def forward(self, x): | |
| # input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device) | |
| # feats = self.model(input_values)["last_hidden_state"] | |
| # return feats | |
| # def get_large_model(): | |
| # model = CNHubertLarge() | |
| # model.eval() | |
| # return model | |
| # | |
| # def get_model_cvec(): | |
| # model = CVec() | |
| # model.eval() | |
| # return model | |
| # | |
| # def get_model_cnw2v2base(): | |
| # model = cnw2v2base() | |
| # model.eval() | |
| # return model | |