{
    "n_layers": 2,
    "d_model": 512,
    "d_mlp": 2048,
    "d_head": 64,
    "n_heads": 8,
    "lr_hidden": 0.004,
    "lr_vector": 0.001,
    "batch_size_per_device": 32,
    "batches_per_step": 1,
    "seed": 9024,
    "save_checkpoints": true,
    "debug": false,
    "debug_batch": false,
    "normalization": null,
    "max_tokens": 12000000000,
    "version": 188,
    "use_bfloat16_matmul": true,
    "n_ctx": 1024,
    "d_vocab": 50277,
    "tokenizer_name": "EleutherAI/gpt-neox-20b",
    "betas": [
        0.9,
        0.99
    ],
    "weight_decay": 0.05,
    "dataset_name": "the_pile",
    "grad_norm_clip": 1.0,
    "n_devices": 8,
    "act_fn": "solu_ln",
    "shortformer_pos": true,
    "attn_only": true,
    "ln_eps": 1e-05,
    "lr_schedule": "cosine_warmup",
    "warmup_tokens": 300000000,
    "train_loss_ewma_beta": 0.99,
    "truncate_tokens": 1000000000000,
    "log_interval": 50,
    "initializer_scale_global": 1.0,
    "initializer_scale_hidden": 0.02,
    "initializer_scale_embed": 0.1,
    "initializer_scale_unembed": 0.02,
    "neuron_scale": 1.0,
    "neuron_temp": 1.0,
    "use_acc": false,
    "weight_init_scheme": "old",
    "fixed_init": "",
    "store_init": true,
    "control": 1.0,
    "tokens_per_step": 262144,
    "batch_size": 256,
    "max_steps": 45776,
    "warmup_steps": 1144,
    "n_params": 1572864
}