# ParaLlama-p-small / configuration_nano.py

from collections import OrderedDict
from typing import Any, List, Mapping, Optional
from transformers import PreTrainedTokenizer, TensorType, is_torch_available
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)


class NanoConfig(PretrainedConfig):
    """Configuration for the Nano (ParaLlama) architecture.

    Stores the hyperparameters used to build the model: embedding and LM-head
    sizes, attention layout, normalization and FFN variants, RoPE settings,
    and the experimental full-adaptation options.
    """

    model_type = "nano"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "hidden_size",
        "max_position_embeddings": "max_position_embeddings",
        "num_attention_heads": "num_attention_heads",
        "num_hidden_layers": "num_hidden_layers",
    }

    def __init__(
        self,
        vocab_size=32000,
        max_position_embeddings=2048,
        expanded_wte_size=None,
        expanded_lm_head_size=None,
        hidden_size=768,
        kv_hidden_size=None,  # set when using cross-attention with a different key/value width
        num_hidden_layers=10,
        num_attention_heads=12,
        intermediate_size=None,
        activation_function="silu",
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        combined_qkv=True,
        use_bias=False,
        lm_head_projection_bias=False,
        lm_head_bias=False,
        layernorm="llamarmsnorm",  # "layernorm" or "llamarmsnorm"
        rope_scaling=None,
        rope_theta=10000,
        ffn="llama-like",
        experimental_full_adaption_rank=None,  # e.g. 8
        full_adaptation_has_pre_proj=True,
        pre_proj_dim=1536,
        full_adaptation_type="no",  # "lora", "no", "linear", "linear-r", "linear-ra"
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.pre_proj_dim = pre_proj_dim
        self.full_adaptation_has_pre_proj = full_adaptation_has_pre_proj
        self.full_adaptation_type = full_adaptation_type
        self.experimental_full_adaption_rank = experimental_full_adaption_rank
        self.ffn = ffn
        self.rope_theta = rope_theta
        self.layernorm = layernorm
        self.rope_scaling = rope_scaling
        self.lm_head_projection_bias = lm_head_projection_bias
        self.kv_hidden_size = kv_hidden_size
        self.lm_head_bias = lm_head_bias
        self.use_bias = use_bias
        self.expanded_wte_size = expanded_wte_size
        self.expanded_lm_head_size = expanded_lm_head_size
        self.combined_qkv = combined_qkv
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Default the FFN width to 4x the hidden size when not given explicitly.
        self.intermediate_size = (
            intermediate_size if intermediate_size is not None else hidden_size * 4
        )
        self.activation_function = activation_function
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        # Forward tie_word_embeddings to PretrainedConfig.__init__; otherwise the
        # parent constructor would reset it to its default (True).
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
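

# Minimal usage sketch: build a NanoConfig and register it with AutoConfig so
# that model_type="nano" resolves through the Auto* factories. The values below
# are illustrative placeholders, not the settings of any released checkpoint.
if __name__ == "__main__":
    from transformers import AutoConfig

    config = NanoConfig(
        vocab_size=32000,
        hidden_size=768,
        num_hidden_layers=10,
        num_attention_heads=12,
    )
    print(config.intermediate_size)  # 3072, i.e. 4 * hidden_size by default

    # Make AutoConfig aware of the "nano" model_type for from_pretrained lookups.
    AutoConfig.register("nano", NanoConfig)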