craffel HF Staff committed on
Commit
abb5e39
·
verified ·
1 Parent(s): 4105bed

Upload google-byt5-small/config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. google-byt5-small/config.yaml +126 -0
google-byt5-small/config.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: google-byt5-small
2
+ dump_dir: /fsx/craffel/toksuite/lingua_logs/google-byt5-small/
3
+ seed: 777
4
+ grad_acc_steps: 8
5
+ gc_collect_freq: 1000
6
+ probe_freq: null
7
+ steps: 100000
8
+ data:
9
+ root_dir: /scratch/craffel/lingua/data/tokenizer_training/
10
+ sources:
11
+ fw_edu: 0.4
12
+ cmn_Hani: 0.15
13
+ tur_Latn: 0.15
14
+ ita_Latn: 0.15
15
+ fas_Arab: 0.15
16
+ batch_size: 4
17
+ seq_len: 4096
18
+ n_views: 2
19
+ seed: 42
20
+ add_bos: true
21
+ add_eos: true
22
+ load_async: true
23
+ prefetch_size: 1024
24
+ tokenizer:
25
+ name: huggingface
26
+ path: google/byt5-small
27
+ n_words: null
28
+ optim:
29
+ lr: 0.001
30
+ weight_decay: 0.1
31
+ epsilon: 1.0e-08
32
+ beta1: 0.9
33
+ beta2: 0.95
34
+ clip: 1.0
35
+ scheduler: cosine
36
+ warmup: 2000
37
+ lr_min_ratio: 1.0e-06
38
+ cycle_length: 1.0
39
+ cosine_theta: 1.0
40
+ annealing_step: 1000
41
+ decay_fraction: 0.1
42
+ exp_factor: 0.5
43
+ model:
44
+ dim: 2048
45
+ n_layers: 25
46
+ head_dim: null
47
+ n_heads: 16
48
+ n_kv_heads: null
49
+ ffn_dim_multiplier: null
50
+ multiple_of: 256
51
+ norm_eps: 1.0e-05
52
+ rope_theta: 10000.0
53
+ init_base_std: null
54
+ init_std_factor: disabled
55
+ max_seqlen: 4096
56
+ seed: 42
57
+ vocab_size: 256
58
+ weight_tying: false
59
+ sliding_window: null
60
+ distributed:
61
+ dp_shard: 1
62
+ dp_replicate: 8
63
+ tp_size: 1
64
+ selective_activation_checkpointing: false
65
+ compile: true
66
+ fsdp_type: full_shard
67
+ model_dtype: bf16
68
+ float8_recipe: null
69
+ float8_filter: layers\.[0-9]+\.
70
+ matmul_allow_tf32: false
71
+ detect_anomaly: false
72
+ compile_cache_size_limit: 8
73
+ spawn_method: forkserver
74
+ env:
75
+ MKL_SERVICE_FORCE_INTEL: GNU
76
+ OMP_NUM_THREADS: '1'
77
+ MKL_NUM_THREADS: '1'
78
+ ENABLE_INTRA_NODE_COMM: '1'
79
+ TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
80
+ NCCL_IB_TIMEOUT: '22'
81
+ NCCL_DEBUG: INFO
82
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
83
+ checkpoint:
84
+ dump:
85
+ every: 10000
86
+ keep: -1
87
+ eval:
88
+ every: 10000
89
+ keep: -1
90
+ path: /fsx/craffel/toksuite/lingua_logs/google-byt5-small/checkpoints
91
+ init_ckpt_path: /fsx/craffel/toksuite/init_checkpoints/google-byt5-small/model_dcp
92
+ load_init_optimizer_state: false
93
+ save_init_ckpt: false
94
+ profiling:
95
+ run: true
96
+ trace_folder: profiling
97
+ mem_warmup: 0
98
+ mem_steps: 4
99
+ profile_warmup: 100
100
+ profile_steps: 4
101
+ logging:
102
+ freq: 1
103
+ acc_freq: null
104
+ wandb: null
105
+ async_eval_gpus: 8
106
+ eval:
107
+ harness:
108
+ tasks:
109
+ - hellaswag
110
+ - piqa
111
+ - arc_easy
112
+ - arc_challenge
113
+ - include_base_44_turkish
114
+ - include_base_44_italian
115
+ - include_base_44_chinese
116
+ - belebele_pes_Arab
117
+ - belebele_eng_Latn
118
+ - belebele_ita_Latn
119
+ - belebele_tur_Latn
120
+ - belebele_zho_Hans
121
+ - xnli_en
122
+ - xnli_tr
123
+ - xnli_zh
124
+ generator:
125
+ max_tokens: 8192
126
+ dtype: bf16