---
library_name: transformers
license: llama3.1
base_model: meta-llama/Meta-Llama-3.1-8B
tags:
- oumi
- generated_from_trainer
datasets:
- HuggingFaceH4/ultrachat_200k
model-index:
- name: Llama-3-8B-UltraChat-200K-Oumi
  results: []
---
[<img src="https://github.com/oumi-ai/oumi/blob/main/docs/_static/logo/header_logo.png?raw=true" alt="Built with Oumi" width="200" height="60"/>](https://github.com/oumi-ai/oumi)
<details><summary>See oumi train config</summary>

oumi version: `0.1.3`
```yaml
data:
  train:
    datasets:
    - dataset_name: HuggingFaceH4/ultrachat_200k
      dataset_path: null
      subset: null
      split: train_sft
      dataset_kwargs: {}
      sample_count: null
      mixture_proportion: null
      shuffle: false
      seed: null
      shuffle_buffer_size: 1000
      trust_remote_code: true
      transform_num_workers: null
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  test:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
model:
  model_name: meta-llama/Meta-Llama-3.1-8B
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  model_max_length: 8192
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: llama3-instruct
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: true
  shard_for_eval: false
  freeze_layers: []
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: output/llama8b-ultrachat
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 800
  save_final_model: true
  seed: 42
  run_name: llama8b-ultrachat.sky-2025-01-30-21-19-10-053582_sky-e018-bf996_1
  metrics_function: null
  log_level: info
  dep_log_level: warning
  enable_wandb: true
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 100
  logging_first_step: false
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: linear
  lr_scheduler_kwargs: {}
  warmup_ratio: null
  warmup_steps: null
  optimizer: paged_adamw_8bit
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 8
  dataloader_prefetch_factor: 32
  dataloader_main_process_only: null
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
  trainer_kwargs:
    max_seq_length: 8192
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 50
  nccl_default_timeout_minutes: null
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
fsdp:
  enable_fsdp: false
  sharding_strategy: FULL_SHARD
  cpu_offload: false
  mixed_precision: null
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: false
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: NO_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
```

</details><br>

<details><summary>See oumi cloud config</summary>

```yaml
name: llama8b-ultrachat-sft

num_nodes: 1
resources:
  cloud: gcp
  accelerators: "A100-80GB:4"
  use_spot: false
  disk_size: 2000 # Disk size in GBs

working_dir: .

file_mounts:
  ~/.netrc: ~/.netrc # WandB credentials
  # Mount HF token, which is needed to download locked-down models from HF Hub.
  # This is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

envs:
  WANDB_PROJECT: oumi-train
  OUMI_RUN_NAME: llama8b-ultrachat
  OUMI_USER_NAME: penfever
  ACCELERATE_LOG_LEVEL: info
  # https://github.com/huggingface/tokenizers/issues/899#issuecomment-1027739758
  TOKENIZERS_PARALLELISM: false
setup: |
  set -e
  pip install uv && uv pip install -e .[gpu,evaluation] hf_transfer
  # Download the model from HF Hub. hf_transfer increases download speed compared to
  # downloading the model during training.
  HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Meta-Llama-3.1-8B --exclude original/*
  pip install -U flash-attn --no-build-isolation

run: |
  set -e # Exit if any command failed.
  source ./configs/examples/misc/sky_init.sh

  set -x
  oumi distributed torchrun \
    -m oumi train \
    -c configs/recipes/llama3_1/sft/8b_full/base_ultrachat.yaml \
    --training.run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"
```

</details><br>

# Llama-3-8B-UltraChat-200K-Oumi

This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the HuggingFaceH4/ultrachat_200k dataset. It achieves a final training loss of 1.0435.

## Model description

This model was trained as a partial reproduction of results from the recent [`WildChat-50M` paper](https://arxiv.org/abs/2501.18511).

```bibtex
@misc{feuer2025wildchat50mdeepdiverole,
      title={WILDCHAT-50M: A Deep Dive Into the Role of Synthetic Data in Post-Training},
      author={Benjamin Feuer and Chinmay Hegde},
      year={2025},
      eprint={2501.18511},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2501.18511},
}
```

## Intended uses & limitations

This model is intended for research use; it has not received any safety-oriented post-training.

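For reference, here is a minimal inference sketch using the `transformers` library. The repository id, prompt, and generation settings are illustrative assumptions, and the sketch assumes the saved tokenizer carries the `llama3-instruct` chat template used during training.

```python
# Minimal sketch (assumed repo id and generation settings; not an official snippet).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "penfever/Llama-3-8B-UltraChat-200K-Oumi"  # assumed repository id

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

messages = [{"role": "user", "content": "Explain supervised fine-tuning in two sentences."}]
# Assumes the tokenizer ships the llama3-instruct chat template used in training.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=256, do_sample=False)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```
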
## Artifacts

The following artifacts may be present in this repository, along with brief descriptions of what they contain.

### Logs

Contains logs from the training process, one for each rank.

### Telemetry

`devices_info.txt`: A file containing information about the devices used to train the model.

`telemetry_callback_metrics.json`: File containing metrics from the training process, such as loss and number of tokens seen.

`telemetry_callback_wandb.json`: File containing Weights & Biases parameters.

`telemetry_callback.json`: File containing metadata such as time to train and number of epochs trained.

`training_config.yaml`: File containing the training configuration used to train the model (also included in this README).

`world_size.json`: File containing the world size (number of processes) used to train the model.

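A minimal sketch for inspecting the telemetry artifacts is shown below. The file names follow the list above; the directory location and the exact JSON structure are assumptions and may differ.

```python
# Minimal sketch: print the top-level keys of the telemetry JSON artifacts.
# File names come from the list above; their JSON schema is an assumption.
import json
from pathlib import Path

telemetry_dir = Path("telemetry")  # matches `telemetry_dir` in the training config

for name in [
    "telemetry_callback.json",
    "telemetry_callback_metrics.json",
    "telemetry_callback_wandb.json",
    "world_size.json",
]:
    path = telemetry_dir / name
    if path.exists():
        data = json.loads(path.read_text())
        summary = list(data.keys()) if isinstance(data, dict) else type(data).__name__
        print(f"{name}: {summary}")
```
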
## Datasets

Summary statistics about the datasets used to train this model.

### HuggingFaceH4/ultrachat_200k

`Split`: train_sft

`Version`: 0.0.0

`Dataset size`: 3047427114 bytes

`Download size`: 1624049723 bytes

`Size`: 4671476837 bytes

`Rows`: 207865

`Columns`: ['prompt', 'prompt_id', 'messages']

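A minimal sketch for loading the same split with the `datasets` library is shown below; the row count and column names match the statistics above.

```python
# Minimal sketch: load the SFT split used for training and inspect it.
from datasets import load_dataset

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")

print(ds.num_rows)      # 207865, per the statistics above
print(ds.column_names)  # ['prompt', 'prompt_id', 'messages']
print(ds[0]["messages"][0])  # first turn of the first conversation
```
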
## Results

### Training Loss

| Training Loss | Epoch | Tokens Seen |
|:-------------:|:-----:|:-----------:|
| 1.043 | 0.999 | 246M |

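For context, the effective global batch size implied by the configs above follows from simple arithmetic. The GPU count is an assumption taken from the cloud config (a single node with `A100-80GB:4`); since FSDP is disabled, training runs data-parallel across those devices.

```python
# Sketch: effective global batch size implied by the training and cloud configs.
per_device_train_batch_size = 1  # from the training config
gradient_accumulation_steps = 8  # from the training config
num_gpus = 4                     # assumption: one node with A100-80GB:4, per the cloud config

global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(global_batch_size)  # 32 sequences per optimizer step
```
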
### Evaluation

Following the paper, our benchmark results are reported using [Evalchemy](https://github.com/mlfoundations/evalchemy/). For more details on the evaluation metrics, please refer to the [paper](https://arxiv.org/abs/2501.18511). We compare to [this baseline model](https://huggingface.co/tanliboy/zephyr-llama-3-8b-sft) used in the paper.

| Metric | Oumi Repro | Baseline |
|--------|------------|----------|
| MTBench | 5.2313 | 5.0187 |
| Alpaca Eval (LC) | 1.6157 | 4.1260 |
| BBH | 0.4861 | 0.4845 |
| GPQA | 0.2903 | 0.3204 |
| MATH | 0.0552 | 0.0458 |
| MUSR | 0.4116 | 0.3917 |
| IFEval (Prompt Level, Strict) | 0.1978 | 0.2643 |
| MMLU Pro | 0.3118 | 0.3198 |
| MixEval | 0.5935 | 0.63 |
| Average | 0.321 | 0.333 |
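
The `Average` row appears to be the mean over all nine metrics after rescaling MTBench by 1/10 and Alpaca Eval (LC) by 1/100 onto a 0-1 range; this is an inference from the reported numbers, not something stated in the paper. A small sketch of that calculation:

```python
# Sketch: reproduce the reported Average, assuming MTBench/10 and AlpacaEval/100 rescaling.
oumi_scores = {
    "MTBench": 5.2313 / 10,
    "AlpacaEval_LC": 1.6157 / 100,
    "BBH": 0.4861,
    "GPQA": 0.2903,
    "MATH": 0.0552,
    "MUSR": 0.4116,
    "IFEval": 0.1978,
    "MMLU_Pro": 0.3118,
    "MixEval": 0.5935,
}
print(round(sum(oumi_scores.values()) / len(oumi_scores), 3))  # 0.321, matching the table
```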