from transformers import GPT2Config


class NomicBertConfig(GPT2Config):
    model_type = "nomic_bert"

    def __init__(
        self,
        prenorm=False,
        parallel_block=False,
        parallel_block_tied_norm=False,
        rotary_emb_fraction=0.0,
        fused_dropout_add_ln=False,
        fused_bias_fc=False,
        use_flash_attn=False,
        use_xentropy=False,
        qkv_proj_bias=True,
        rotary_emb_base=10_000,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        mlp_fc1_bias=True,
        mlp_fc2_bias=True,
        use_rms_norm=False,
        causal=False,
        type_vocab_size=2,
        dense_seq_output=True,
        pad_vocab_size_multiple=1,
        tie_word_embeddings=True,
        rotary_scaling_factor=None,
        max_trained_positions=2048,
        **kwargs,
    ):
        # Block layout: pre-norm vs. post-norm residuals, and GPT-J-style
        # parallel attention/MLP blocks (optionally sharing a single norm).
        self.prenorm = prenorm
        self.parallel_block = parallel_block
        self.parallel_block_tied_norm = parallel_block_tied_norm
        # Fraction of each attention head's dimensions that receive rotary
        # position embeddings (0.0 disables them).
        self.rotary_emb_fraction = rotary_emb_fraction
        # Tie the input embedding matrix to the output projection.
        self.tie_word_embeddings = tie_word_embeddings
        # Optional fused/optimized kernels: fused dropout + residual + norm,
        # fused bias-FC, FlashAttention, and a fused cross-entropy loss.
        self.fused_dropout_add_ln = fused_dropout_add_ln
        self.fused_bias_fc = fused_bias_fc
        self.use_flash_attn = use_flash_attn
        self.use_xentropy = use_xentropy
        # Bias term on the attention QKV projection.
        self.qkv_proj_bias = qkv_proj_bias
        # Rotary embedding hyperparameters: base frequency, optional scale
        # base, and whether the rotation uses the interleaved layout.
        self.rotary_emb_base = rotary_emb_base
        self.rotary_emb_scale_base = rotary_emb_scale_base
        self.rotary_emb_interleaved = rotary_emb_interleaved
        # Bias terms on the two MLP linear layers.
        self.mlp_fc1_bias = mlp_fc1_bias
        self.mlp_fc2_bias = mlp_fc2_bias
        # RMSNorm instead of LayerNorm; causal=False keeps the encoder
        # bidirectional.
        self.use_rms_norm = use_rms_norm
        self.causal = causal
        # Token-type (segment) embeddings, as in BERT.
        self.type_vocab_size = type_vocab_size
        # Gather only the positions needed by the prediction head rather than
        # the full sequence output.
        self.dense_seq_output = dense_seq_output
        # Pad the vocabulary size up to a multiple of this value.
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        # Rotary scaling used when running past the trained context length.
        self.rotary_scaling_factor = rotary_scaling_factor
        self.max_trained_positions = max_trained_positions
        # Everything else (hidden size, layer count, vocab size, ...) is
        # handled by GPT2Config / PretrainedConfig.
        super().__init__(**kwargs)
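A minimal usage sketch, assuming the class above is importable as defined; the values are illustrative rather than the published nomic-bert-2048 defaults, and the output directory name is hypothetical. Backbone sizes such as n_embd, n_head, and n_layer are inherited GPT2Config fields and pass straight through **kwargs.

# Illustrative values only (not a released checkpoint's configuration).
config = NomicBertConfig(
    rotary_emb_fraction=1.0,     # rotate the full head dimension
    use_flash_attn=True,         # assumes a FlashAttention-capable setup
    rotary_scaling_factor=2.0,   # scale rotary frequencies past the trained length
    max_trained_positions=2048,
    n_embd=768,                  # GPT2Config fields forwarded via **kwargs
    n_head=12,
    n_layer=12,
)

print(config.model_type)                    # "nomic_bert"
config.save_pretrained("nomic_bert_demo")   # hypothetical path; writes config.json

Subclassing GPT2Config keeps the standard save_pretrained / from_pretrained serialization while layering on the rotary, fused-kernel, and context-extension switches that the accompanying model code consumes.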