from dataclasses import dataclass


@dataclass
class Config:
    seed: int = 49
    vocab_size: int = 49152            # must match the tokenizer's vocabulary size
    num_hidden_layers: int = 30        # number of transformer layers
    num_attention_heads: int = 9       # number of query heads
    num_key_value_heads: int = 3       # number of key/value heads (grouped-query attention)
    nn_embed: int = 576                # embedding dimension (hidden_size)
    max_sequence_len: int = 2048       # max token sequence length for positional embeddings (block size)
    ffn_intermediate_size: int = 1536
    rms_norm_eps: float = 1.0e-05
    nn_top_k: int = 50                 # top-k sampling cutoff for generation
    nn_temperature: float = 1.0        # sampling temperature for generation
    tokenizer_name_or_path: str = "HuggingFaceTB/cosmo2-tokenizer"
    checkpoint_interval: int = 1000
    checkpoints_path: str = "checkpoints"
    # init_method_std: 0.041666666666666664
    nn_train_tok_seq: int = 65         # training block size: 64 tokens + 1, since targets are shifted by one position
    # nn_mlp_expansion: int = 4        # expansion factor in the MLP layer
    batch_size: int = 64
    # train_tok_size: int = 32
    # saved_model_path = 'data/model_tf.pth'
    # train_input_file = 'data/input.txt'
    optimizer_learning_rate_scheduler_learning_rate: float = 0.003
    optimizer_learning_rate_scheduler_lr_decay_starting_step: int = 1600000
    optimizer_learning_rate_scheduler_lr_decay_steps: int = 400000
    optimizer_learning_rate_scheduler_lr_decay_style: str = "linear"
    optimizer_learning_rate_scheduler_lr_warmup_steps: int = 2000
    optimizer_learning_rate_scheduler_lr_warmup_style: str = "linear"
    optimizer_learning_rate_scheduler_min_decay_lr: float = 0.0
    optimizer_factory_adam_beta1: float = 0.9
    optimizer_factory_adam_beta2: float = 0.95
    optimizer_factory_adam_eps: float = 1.0e-08
    optimizer_factory_name: str = "adamW"
    optimizer_factory_torch_adam_is_fused: bool = True
    optimizer_weight_decay: float = 0.01
    optimizer_zero_stage: int = 0
    optimizer_clip_grad: float = 1.0
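

# The sketch below is illustrative and not part of the original file: it shows how a few of
# the values above relate (per-head dimension, grouped-query-attention ratio, input block
# length) and how the linear warmup / linear decay implied by the
# optimizer_learning_rate_scheduler_* fields could be computed. The names `lr_at`,
# `head_dim`, `gqa_group_size`, and `inputs_per_block` are assumptions for this example.

cfg = Config()

# Per-head dimension: the embedding is split evenly across the query heads.
head_dim = cfg.nn_embed // cfg.num_attention_heads                     # 576 // 9 = 64
assert cfg.nn_embed % cfg.num_attention_heads == 0

# Grouped-query attention: 9 query heads share 3 key/value heads, i.e. groups of 3.
gqa_group_size = cfg.num_attention_heads // cfg.num_key_value_heads    # 9 // 3 = 3
assert cfg.num_attention_heads % cfg.num_key_value_heads == 0

# Each training block is 65 tokens: 64 inputs plus 1 extra token so targets can be shifted by one.
inputs_per_block = cfg.nn_train_tok_seq - 1                             # 64


def lr_at(step: int, cfg: Config = Config()) -> float:
    """Learning rate at a given step under the linear-warmup / linear-decay schedule
    suggested by the scheduler fields above (a sketch, not the actual training loop)."""
    peak = cfg.optimizer_learning_rate_scheduler_learning_rate
    warmup = cfg.optimizer_learning_rate_scheduler_lr_warmup_steps
    decay_start = cfg.optimizer_learning_rate_scheduler_lr_decay_starting_step
    decay_steps = cfg.optimizer_learning_rate_scheduler_lr_decay_steps
    min_lr = cfg.optimizer_learning_rate_scheduler_min_decay_lr
    if step < warmup:                     # linear warmup from 0 up to the peak LR
        return peak * step / warmup
    if step < decay_start:                # hold at the peak LR until decay begins
        return peak
    progress = min(1.0, (step - decay_start) / decay_steps)
    return peak - (peak - min_lr) * progress   # linear decay down to min_decay_lr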