{
  "n_layers": 2,
  "d_model": 512,
  "d_mlp": 2048,
  "d_head": 64,
  "n_heads": 8,
  "lr_hidden": 0.004,
  "lr_vector": 0.001,
  "batch_size_per_device": 32,
  "batches_per_step": 1,
  "seed": 9024,
  "save_checkpoints": true,
  "debug": false,
  "debug_batch": false,
  "normalization": null,
  "max_tokens": 12000000000,
  "version": 188,
  "use_bfloat16_matmul": true,
  "n_ctx": 1024,
  "d_vocab": 50277,
  "tokenizer_name": "EleutherAI/gpt-neox-20b",
  "betas": [
    0.9,
    0.99
  ],
  "weight_decay": 0.05,
  "dataset_name": "the_pile",
  "grad_norm_clip": 1.0,
  "n_devices": 8,
  "act_fn": "solu_ln",
  "shortformer_pos": true,
  "attn_only": true,
  "ln_eps": 1e-05,
  "lr_schedule": "cosine_warmup",
  "warmup_tokens": 300000000,
  "train_loss_ewma_beta": 0.99,
  "truncate_tokens": 1000000000000,
  "log_interval": 50,
  "initializer_scale_global": 1.0,
  "initializer_scale_hidden": 0.02,
  "initializer_scale_embed": 0.1,
  "initializer_scale_unembed": 0.02,
  "neuron_scale": 1.0,
  "neuron_temp": 1.0,
  "use_acc": false,
  "weight_init_scheme": "old",
  "fixed_init": "",
  "store_init": true,
  "control": 1.0,
  "tokens_per_step": 262144,
  "batch_size": 256,
  "max_steps": 45776,
  "warmup_steps": 1144,
  "n_params": 1572864
}