from transformers import PretrainedConfig


class RECAST8b_llama(PretrainedConfig):
    """Configuration for the RECAST8b Llama model.

    Combines standard Llama-3-style hyperparameters with the RECAST-specific
    settings (`num_templates`, `num_groups`, `coef_height`, `num_cf`).
    """

    model_type = "recast8b_llama"
    attribute_map = {
        "hidden_size": "hidden_size",
        "num_attention_heads": "num_attention_heads",
    }

    def __init__(
        self,
        vocab_size=128256,
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=16,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=131072,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=128000,
        eos_token_id=128001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=500000.0,
        rope_scaling={
            "factor": 32.0,
            "low_freq_factor": 1.0,
            "high_freq_factor": 4.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        },
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        # RECAST-specific parameters
        num_templates=2,
        num_groups=8,
        coef_height=4,
        num_cf=1,
        torch_dtype="bfloat16",
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.mlp_bias = mlp_bias
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.torch_dtype = torch_dtype

        # RECAST-specific attributes
        self.num_templates = num_templates
        self.num_groups = num_groups
        self.coef_height = coef_height
        self.num_cf = num_cf

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
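

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal, hedged example of how a custom config like this is typically wired
# into the Hugging Face custom-model pattern: instantiate it directly, or register
# the model_type with AutoConfig so it can be resolved by name. AutoConfig.register
# is the public transformers API; the keyword overrides below are only examples.
if __name__ == "__main__":
    from transformers import AutoConfig

    # Map the "recast8b_llama" model_type to this config class.
    AutoConfig.register("recast8b_llama", RECAST8b_llama)

    # Instantiate with the defaults above; any keyword can be overridden.
    config = RECAST8b_llama(num_hidden_layers=16, num_templates=2)
    print(config.model_type, config.hidden_size, config.num_templates)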