---
license: apache-2.0
datasets:
- EleutherAI/the_pile_deduplicated
language:
- en
---

Pythia-2.8B Deduped 4K is a [Pythia-2.8B Deduped](https://huggingface.co/EleutherAI/pythia-2.8b-deduped) model fine-tuned with a 4096-token context length. Training resumed from EleutherAI's step 143,000 checkpoint and continued on The Pile v1 Deduped (deduplication threshold = 0.87). This particular model is from the checkpoint captured at step 175,500, after an extra 134,217,728,000 tokens of training. A minimal loading sketch appears under Usage at the end of this card.

Note: Sequence length warmup was not used when moving up from the original 2048 context length but, in hindsight, it should have been.

## Config

A few of the schedule and batch-size keys are unpacked in the notes at the end of this card.

```yaml
{
  # 8 Nodes 8xA100 40GB
  "eval_batch_size": 2,
  "pipe-parallel-size": 1,
  "model-parallel-size": 1,

  "num-layers": 32,
  "hidden-size": 2560,
  "num-attention-heads": 32,
  "seq-length": 4096,
  "max-position-embeddings": 4096,
  "norm": "layernorm",
  "pos-emb": "rotary",
  "rotary-pct": 0.25,
  "no-weight-tying": true,
  "gpt-j-residual": true,
  "output-layer-parallelism": "column",

  "init_method": "small_init",
  "output_layer_init_method": "wang_init",

  "attention-config": [[["flash"], 32]],

  "scaled-upper-triang-masked-softmax-fusion": true,
  "bias-gelu-fusion": true,

  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1.6e-5,
      "betas": [0.9, 0.95],
      "eps": 1.0e-08
    },
  },
  "min_lr": 8.0e-06,

  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true,
    "cpu_offload": false,
  },

  "train_micro_batch_size_per_gpu": 4,
  "gradient-accumulation-steps": 4,
  "data-impl": "mmap",

  "checkpoint-activations": true,
  "checkpoint-num-layers": 1,
  "partition-activations": true,
  "synchronize-each-layer": true,

  "gradient_clipping": 1.0,
  "weight-decay": 0.1,
  "hidden-dropout": 0,
  "attention-dropout": 0,

  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 12,
    "hysteresis": 2,
    "min_loss_scale": 1,
  },

  "train-iters": 318000,
  "lr-decay-iters": 318000,
  "distributed-backend": "nccl",
  "lr-decay-style": "cosine",
  "warmup": 0.01,
  "checkpoint-factor": 500,
  "eval-interval": 50000,
  "eval-iters": 10,
  "extra-save-iters": [0, 512, 152001],

  "train-data-paths": ["pile_0.87_deduped_text_document"],
  "valid-data-paths": ["pile_0.87_deduped_text_document"],
  "test-data-paths": ["pile_0.87_deduped_text_document"],

  "tokenizer_type": "HFTokenizer",
  "vocab-file": "20B_tokenizer.json",

  "log-interval": 10,
  "steps_per_print": 10,
  "wall_clock_breakdown": true,
  "log-grad-norm": true,

  "launcher": "slurm",
  "deepspeed_slurm": true,
}
```

## Acknowledgements

This work would not have been possible without the support of [Stability AI](https://stability.ai/).
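## Usage

A minimal loading sketch with Hugging Face Transformers; Pythia checkpoints load through the standard auto classes. The repository id below is a placeholder, not confirmed by this card; substitute the actual Hub id of this model.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repository id -- replace with this model's actual Hub id.
model_id = "your-org/pythia-2.8b-deduped-4k"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# The model was fine-tuned with seq-length 4096, so anything up to
# 4096 tokens (prompt plus generated text) is in-distribution.
inputs = tokenizer("The Pile is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```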
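## Schedule and batch-size notes

The optimizer block in the config specifies cosine decay from `1.6e-5` down to a `min_lr` of `8.0e-06` over 318,000 iterations, with 1% linear warmup (`"warmup": 0.01`). A rough re-implementation of that schedule, assuming GPT-NeoX-style linear warmup followed by cosine annealing (the exact step accounting in GPT-NeoX may differ slightly):

```python
import math

LR, MIN_LR = 1.6e-5, 8.0e-6             # "lr" and "min_lr" from the config
DECAY_ITERS = 318_000                   # "lr-decay-iters"
WARMUP_ITERS = int(0.01 * DECAY_ITERS)  # "warmup": 0.01 -> 3,180 steps

def lr_at(step: int) -> float:
    """Linear warmup, then cosine decay from LR down to MIN_LR."""
    if step < WARMUP_ITERS:
        return LR * step / WARMUP_ITERS
    progress = min((step - WARMUP_ITERS) / (DECAY_ITERS - WARMUP_ITERS), 1.0)
    return MIN_LR + 0.5 * (LR - MIN_LR) * (1.0 + math.cos(math.pi * progress))

# Fine-tuning resumed at step 143,000; this checkpoint is step 175,500.
for step in (143_000, 175_500, 318_000):
    print(f"step {step}: lr = {lr_at(step):.3e}")
```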
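With `pipe-parallel-size` and `model-parallel-size` both 1, all 64 GPUs (8 nodes of 8) are data-parallel, so the global batch follows directly from the micro-batch and gradient-accumulation settings. A quick sanity check of the token figure quoted at the top of this card, under that assumption:

```python
nodes, gpus_per_node = 8, 8      # "# 8 Nodes 8xA100 40GB"
micro_batch, grad_accum = 4, 4   # from the config above
seq_len = 4096                   # "seq-length"

data_parallel_ranks = nodes * gpus_per_node                          # 64
sequences_per_step = data_parallel_ranks * micro_batch * grad_accum  # 1024
tokens_per_step = sequences_per_step * seq_len                       # 4,194,304

extra_tokens = 134_217_728_000             # figure quoted above
print(tokens_per_step)                     # 4194304
print(extra_tokens / tokens_per_step)      # 32000.0 optimizer steps
```

At this batch size the quoted token total corresponds to 32,000 optimizer steps, close to the 32,500-step gap between the resume point and this checkpoint.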