|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" GLMAudio model configuration """ |
|
|
|
from typing import Dict |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.utils import logging |
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|
class BailingTalkerConfig(PretrainedConfig):
    """Configuration class for the BailingTalker audio-generation model.

    Holds the hyperparameters that define a talker model instance; any extra
    keyword arguments are forwarded to :class:`~transformers.PretrainedConfig`.

    Args:
        pretrained_model_path: Optional path to pretrained talker weights.
        qa_model_hidden_size: Hidden size of the upstream QA model whose
            states are projected into the talker (default 2048).
        vocab_size: Total (text + audio) vocabulary size (default 184445).
        text_vocab_size: Size of the text sub-vocabulary (default 151677).
        audio_vocab_size: Size of the audio sub-vocabulary (default 32768).
        vp_feature_size: Voice-print feature dimension (default 192).
        vp_kernel_size: Kernel size of the voice-print projection conv
            (default 1).
        vp_stride: Stride of the voice-print projection conv (default 1).
        s3bpe_tokenizer: Optional S3-BPE tokenizer (spec/path/object —
            whatever the model loader expects; opaque at this level).
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        pretrained_model_path=None,
        qa_model_hidden_size=2048,
        vocab_size=184445,
        text_vocab_size=151677,
        audio_vocab_size=32768,
        vp_feature_size=192,
        vp_kernel_size=1,
        vp_stride=1,
        s3bpe_tokenizer=None,
        **kwargs,
    ):
        # Model / weight location and tokenizer wiring.
        self.pretrained_model_path = pretrained_model_path
        self.s3bpe_tokenizer = s3bpe_tokenizer

        # Vocabulary layout: full table plus the text/audio partitions.
        self.vocab_size = vocab_size
        self.text_vocab_size = text_vocab_size
        self.audio_vocab_size = audio_vocab_size

        # Projection from the QA model's hidden states into the talker.
        self.qa_model_hidden_size = qa_model_hidden_size

        # Voice-print (speaker embedding) convolutional front-end.
        self.vp_feature_size = vp_feature_size
        self.vp_kernel_size = vp_kernel_size
        self.vp_stride = vp_stride

        # Base-class init last so HF-standard kwargs are applied on top.
        super().__init__(**kwargs)
|
|