# Training configuration for the run named `common-pile-comma-v0.1`.
# Viewer metadata carried over from the scraped page: file size 2,626 bytes,
# revision eefd238.
#
# NOTE(review): the nesting below was reconstructed after indentation was
# lost. The section headers and the repeated keys (`seed`, `name`, `eval`,
# `every`, `keep`, `path`) show that nested mappings existed, but the exact
# end of each mapping is inferred from key grouping -- confirm against the
# schema of the program that loads this file.
---
name: common-pile-comma-v0.1
dump_dir: /fsx/craffel/toksuite/lingua_logs/common-pile-comma-v0.1/
seed: 777
grad_acc_steps: 8
gc_collect_freq: 1000
probe_freq: null
steps: 100000

data:
  root_dir: /scratch/craffel/lingua/data/tokenizer_training/
  # Values sum to 1.0; presumably per-source mixing weights -- confirm.
  sources:
    fw_edu: 0.4
    cmn_Hani: 0.15
    tur_Latn: 0.15
    ita_Latn: 0.15
    fas_Arab: 0.15
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 1024
  # NOTE(review): placed under `data` by inference; verify it is not a
  # top-level section in the consumer's schema.
  tokenizer:
    name: huggingface
    path: common-pile/comma-v0.1
    n_words: null

optim:
  lr: 0.001
  weight_decay: 0.1
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: cosine
  warmup: 2000
  lr_min_ratio: 1.0e-06
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 2048
  n_layers: 25
  head_dim: null
  n_heads: 16
  n_kv_heads: null
  ffn_dim_multiplier: null
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 10000.0
  init_base_std: null
  init_std_factor: disabled
  # Same value as data.seq_len above.
  max_seqlen: 4096
  seed: 42
  vocab_size: 64000
  weight_tying: false
  sliding_window: null

distributed:
  dp_shard: 1
  dp_replicate: 8
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Looks like a regular expression over module names -- confirm. Kept as a
  # plain scalar so the backslashes stay literal.
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Numeric-looking values are quoted so they load as strings, not integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  dump:
    every: 6000
    keep: -1
  eval:
    every: 2000
    keep: -1
  # NOTE(review): the keys from `path` onward are assumed to belong to
  # `checkpoint` itself rather than to `checkpoint.eval` -- confirm.
  path: /fsx/craffel/toksuite/lingua_logs/common-pile-comma-v0.1/checkpoints
  init_ckpt_path: /fsx/craffel/toksuite/init_checkpoints/common-pile-comma-v0.1/model_dcp
  load_init_optimizer_state: false
  save_init_ckpt: false

profiling:
  run: true
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1
  acc_freq: null
  wandb: null

async_eval_gpus: 8

eval:
  harness:
    tasks:
      - hellaswag
      - piqa
      - arc_easy
      - arc_challenge
      - include_base_44_turkish
      - include_base_44_italian
      - include_base_44_chinese
      - belebele_pes_Arab
      - belebele_eng_Latn
      - belebele_ita_Latn
      - belebele_tur_Latn
      - belebele_zho_Hans
      - xnli_en
      - xnli_tr
      - xnli_zh
    # NOTE(review): assumed to be a `harness` option because it directly
    # follows the task list -- confirm.
    confirm_run_unsafe_code: true
  generator:
    # NOTE(review): larger than model.max_seqlen (4096) -- confirm intended.
    max_tokens: 8192
    dtype: bf16