# SmolLM2-135m / config.py
from dataclasses import dataclass
@dataclass
class Config:
    seed: int = 49
    vocab_size: int = 49152 # must match the vocabulary size of the tokenizer
    num_hidden_layers: int = 30 # number of transformer layers
    num_attention_heads: int = 9 # number of attention (query) heads
    num_key_value_heads: int = 3 # number of key/value heads (grouped-query attention; see the geometry note after the class)
    nn_embed: int = 576 # embedding dimension (hidden_size)
    max_sequence_len: int = 2048 # max token sequence length for positional embeddings, i.e. the block size
    ffn_intermediate_size: int = 1536
    rms_norm_eps: float = 1.0e-05
    nn_top_k: int = 50 # top-k filtering when sampling from the model (see the sampling sketch after the class)
    nn_temperature: float = 1.0 # softmax temperature when sampling from the model
    tokenizer_name_or_path: str = "HuggingFaceTB/cosmo2-tokenizer" # see the tokenizer check after the class
    checkpoint_interval: int = 1000
    checkpoints_path: str = "checkpoints"
    # init_method_std: 0.041666666666666664
    nn_train_tok_seq: int = 65 # actual training block size: 64 tokens + 1, since targets are shifted by one position
    # nn_mlp_expansion: int = 4 # Expansion in the MLP layer
    batch_size: int = 64
    # train_tok_size: int = 32
    # saved_model_path = 'data/model_tf.pth'
    # train_input_file = 'data/input.txt'
    # Learning-rate schedule: linear warmup, a constant phase, then linear decay (see the schedule sketch after the class)
    optimizer_learning_rate_scheduler_learning_rate: float = 0.003
    optimizer_learning_rate_scheduler_lr_decay_starting_step: int = 1600000
    optimizer_learning_rate_scheduler_lr_decay_steps: int = 400000
    optimizer_learning_rate_scheduler_lr_decay_style: str = "linear"
    optimizer_learning_rate_scheduler_lr_warmup_steps: int = 2000
    optimizer_learning_rate_scheduler_lr_warmup_style: str = "linear"
    optimizer_learning_rate_scheduler_min_decay_lr: float = 0.0
    # AdamW optimizer settings (see the optimizer sketch after the class)
    optimizer_factory_adam_beta1: float = 0.9
    optimizer_factory_adam_beta2: float = 0.95
    optimizer_factory_adam_eps: float = 1.0e-08
    optimizer_factory_name: str = "adamW"
    optimizer_factory_torch_adam_is_fused: bool = True
    optimizer_weight_decay: float = 0.01
    optimizer_zero_stage: int = 0
    optimizer_clip_grad: float = 1.0
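

# Attention geometry implied by the fields above (worked numbers only, nothing new is
# configured here): with nn_embed = 576 and num_attention_heads = 9, the per-head
# dimension is 576 // 9 = 64, and with num_key_value_heads = 3 each key/value head is
# shared by 9 // 3 = 3 query heads (grouped-query attention).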
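

# A minimal sanity-check sketch, assuming the `transformers` package is installed:
# vocab_size above is meant to match the tokenizer named by tokenizer_name_or_path,
# so it is cheap to verify the two agree before training. The function name is illustrative.
def check_tokenizer_vocab(cfg: Config) -> None:
    from transformers import AutoTokenizer  # assumed dependency, not imported by the original file

    tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer_name_or_path)
    # len(tokenizer) counts added special tokens as well; the embedding table must
    # cover at least that many ids (padding the model vocab beyond it is harmless).
    assert len(tokenizer) <= cfg.vocab_size, (
        f"tokenizer defines {len(tokenizer)} tokens, but vocab_size is {cfg.vocab_size}"
    )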
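

# A hypothetical sketch of how nn_top_k and nn_temperature are typically applied when
# sampling the next token (assumes PyTorch; the function name and tensor shapes are
# illustrative, not taken from the training code): scale the logits by the temperature,
# keep the k highest-scoring candidates, and sample from the renormalised distribution.
def sample_next_token(logits, cfg: Config):
    import torch  # assumed dependency

    # logits: (batch, vocab_size) raw scores for the next token
    logits = logits / cfg.nn_temperature
    top_values, top_indices = torch.topk(logits, cfg.nn_top_k, dim=-1)
    probs = torch.softmax(top_values, dim=-1)          # renormalise over the top-k only
    choice = torch.multinomial(probs, num_samples=1)   # (batch, 1) index into the top-k
    return torch.gather(top_indices, -1, choice)       # (batch, 1) sampled token ids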
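

# The optimizer_learning_rate_scheduler_* fields describe a piecewise-linear schedule:
# linear warmup over lr_warmup_steps, a constant phase at the peak learning rate, then a
# linear decay to min_decay_lr over lr_decay_steps starting at lr_decay_starting_step.
# A minimal sketch of that shape (the helper name is illustrative):
def lr_at_step(step: int, cfg: Config) -> float:
    peak = cfg.optimizer_learning_rate_scheduler_learning_rate
    warmup = cfg.optimizer_learning_rate_scheduler_lr_warmup_steps
    decay_start = cfg.optimizer_learning_rate_scheduler_lr_decay_starting_step
    decay_steps = cfg.optimizer_learning_rate_scheduler_lr_decay_steps
    min_lr = cfg.optimizer_learning_rate_scheduler_min_decay_lr

    if step < warmup:            # linear warmup from 0 to the peak learning rate
        return peak * step / warmup
    if step < decay_start:       # constant phase at the peak learning rate
        return peak
    # linear decay, clamped at min_decay_lr once decay_steps have elapsed
    progress = min((step - decay_start) / decay_steps, 1.0)
    return peak - (peak - min_lr) * progress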
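

# The optimizer_factory_* / optimizer_* fields map naturally onto torch.optim.AdamW plus
# per-step gradient-norm clipping. A sketch assuming PyTorch; `model` stands in for the
# actual nn.Module, and ZeRO stage 0 means no optimizer-state sharding, so plain AdamW
# is used here.
def build_optimizer(model, cfg: Config):
    import torch  # assumed dependency

    return torch.optim.AdamW(
        model.parameters(),
        lr=cfg.optimizer_learning_rate_scheduler_learning_rate,
        betas=(cfg.optimizer_factory_adam_beta1, cfg.optimizer_factory_adam_beta2),
        eps=cfg.optimizer_factory_adam_eps,
        weight_decay=cfg.optimizer_weight_decay,
        fused=cfg.optimizer_factory_torch_adam_is_fused,  # fused kernel requires CUDA parameters
    )


# optimizer_clip_grad would typically be applied just before optimizer.step():
#   torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.optimizer_clip_grad)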