---
library_name: transformers
license: llama3.1
base_model: meta-llama/Meta-Llama-3.1-8B
tags:
- oumi
- generated_from_trainer
datasets:
- HuggingFaceH4/ultrachat_200k
model-index:
- name: Llama-3-8B-UltraChat-200K-Oumi
  results: []
---

[<img src="https://github.com/oumi-ai/oumi/blob/main/docs/_static/logo/header_logo.png?raw=true" alt="Built with Oumi" width="200" height="60"/>](https://github.com/oumi-ai/oumi)

<details><summary>See oumi train config</summary>

oumi version: `0.1.3`
```yaml
data:
  train:
    datasets:
    - dataset_name: HuggingFaceH4/ultrachat_200k
      dataset_path: null
      subset: null
      split: train_sft
      dataset_kwargs: {}
      sample_count: null
      mixture_proportion: null
      shuffle: false
      seed: null
      shuffle_buffer_size: 1000
      trust_remote_code: true
      transform_num_workers: null
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  test:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null
model:
  model_name: meta-llama/Meta-Llama-3.1-8B
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  model_max_length: 8192
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: llama3-instruct
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: true
  shard_for_eval: false
  freeze_layers: []
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: output/llama8b-ultrachat
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 8
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 800
  save_final_model: true
  seed: 42
  run_name: llama8b-ultrachat.sky-2025-01-30-21-19-10-053582_sky-e018-bf996_1
  metrics_function: null
  log_level: info
  dep_log_level: warning
  enable_wandb: true
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 100
  logging_first_step: false
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: linear
  lr_scheduler_kwargs: {}
  warmup_ratio: null
  warmup_steps: null
  optimizer: paged_adamw_8bit
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 8
  dataloader_prefetch_factor: 32
  dataloader_main_process_only: null
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
  trainer_kwargs:
    max_seq_length: 8192
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 50
  nccl_default_timeout_minutes: null
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
fsdp:
  enable_fsdp: false
  sharding_strategy: FULL_SHARD
  cpu_offload: false
  mixed_precision: null
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: false
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: NO_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
```

</details><br>

<details><summary>See oumi cloud config</summary>

```yaml
name: llama8b-ultrachat-sft

num_nodes: 1
resources:
  cloud: gcp
  accelerators: "A100-80GB:4"
  use_spot: false
  disk_size: 2000  # Disk size in GBs.

working_dir: .

file_mounts:
  ~/.netrc: ~/.netrc  # WandB credentials.
  # Mount the HF token, which is needed to download gated models from HF Hub.
  # It is created on the local machine by running `huggingface-cli login`.
  ~/.cache/huggingface/token: ~/.cache/huggingface/token

envs:
  WANDB_PROJECT: oumi-train
  OUMI_RUN_NAME: llama8b-ultrachat
  OUMI_USER_NAME: penfever
  ACCELERATE_LOG_LEVEL: info
  # https://github.com/huggingface/tokenizers/issues/899#issuecomment-1027739758
  TOKENIZERS_PARALLELISM: false

setup: |
  set -e
  pip install uv && uv pip install -e .[gpu,evaluation] hf_transfer
  # Download the model from HF Hub ahead of time; hf_transfer increases
  # download speed compared to fetching the model during training.
  HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Meta-Llama-3.1-8B --exclude original/*
  pip install -U flash-attn --no-build-isolation

run: |
  set -e  # Exit if any command failed.
  source ./configs/examples/misc/sky_init.sh

  set -x
  oumi distributed torchrun \
    -m oumi train \
    -c configs/recipes/llama3_1/sft/8b_full/base_ultrachat.yaml \
    --training.run_name "${OUMI_RUN_NAME}.${SKYPILOT_TASK_ID}"

  echo "Node ${SKYPILOT_NODE_RANK} is all done!"
```

</details><br>

# Llama-3-8B-UltraChat-200K-Oumi

This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the [HuggingFaceH4/ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) dataset. It reaches a final training loss of 1.0435.

## Model description

This model was trained as a partial reproduction of results from the recent [`WildChat-50M` paper](https://arxiv.org/abs/2501.18511):

```bibtex
@misc{feuer2025wildchat50mdeepdiverole,
      title={WILDCHAT-50M: A Deep Dive Into the Role of Synthetic Data in Post-Training},
      author={Benjamin Feuer and Chinmay Hegde},
      year={2025},
      eprint={2501.18511},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2501.18511},
}
```
+
|
259 |
+
## Intended uses & limitations
|
260 |
+
|
261 |
+
This model is intended for research use; it has not received any safety oriented post-training.
|
262 |
+
|
263 |
+
## Artifacts
|
264 |
+
|
265 |
+
The following is a list of artifacts which may be present in this repository, as well as brief descriptions of what they contain.
|
266 |
+
|
267 |
+
### Logs
|
268 |
+
|
269 |
+
Contains logs from the training process, one for each rank.
|
270 |
+
|
271 |
+
### Telemetry
|
272 |
+
|
273 |
+
`devices_info.txt`: A file containing information about the devices used to train the model.
|
274 |
+
|
275 |
+
`telemetry_callback_metrics.json`: File containing metrics from the training process such as loss and number of tokens seen.
|
276 |
+
|
277 |
+
`telemetry_callback_wandb.json`: File containing weights and biases parameters.
|
278 |
+
|
279 |
+
`telemetry_callback.json`: File containing metadata such as time to train and number of epochs trained.
|
280 |
+
|
281 |
+
`training_config.yaml`: File containing the training configuration used to train the model (also found in this README)
|
282 |
+
|
283 |
+
`world_size.json`: File containing the world size used to train the model.
|
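
A small sketch for inspecting the JSON artifacts after downloading the repository locally. The `telemetry/` directory name is an assumption taken from the `telemetry_dir: telemetry` setting in the train config; adjust the path if the files sit at the repository root:

```python
# Sketch: print the telemetry JSON artifacts shipped with this repository.
import json
from pathlib import Path

telemetry_dir = Path("telemetry")  # assumed from telemetry_dir in the train config
for name in (
    "telemetry_callback_metrics.json",
    "telemetry_callback.json",
    "world_size.json",
):
    path = telemetry_dir / name
    if path.exists():
        print(f"--- {name} ---")
        print(json.dumps(json.loads(path.read_text()), indent=2))
```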

## Datasets

Summary statistics for the datasets used to train this model.

### HuggingFaceH4/ultrachat_200k

- `Split`: train_sft
- `Version`: 0.0.0
- `Dataset size`: 3,047,427,114 bytes
- `Download size`: 1,624,049,723 bytes
- `Size`: 4,671,476,837 bytes
- `Rows`: 207,865
- `Columns`: ['prompt', 'prompt_id', 'messages']
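
As a quick sanity check, the split can be loaded and the row count and columns verified with the `datasets` library:

```python
# Sketch: load the SFT split used for training and verify the stats above.
from datasets import load_dataset

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
print(len(ds))          # expected: 207865 rows
print(ds.column_names)  # expected: ['prompt', 'prompt_id', 'messages']
```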

## Results

### Training Loss

| Training Loss | Epoch | Tokens Seen |
|:-------------:|:-----:|:-----------:|
| 1.043 | 0.999 | 246M |
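
A back-of-the-envelope check relating the token count to the dataset size (assuming "Tokens Seen" covers the full one-epoch pass over train_sft):

```python
# Rough arithmetic: average tokens per training conversation implied above,
# assuming ~246M tokens were seen across the 207,865-row epoch.
tokens_seen = 246_000_000
rows = 207_865
print(round(tokens_seen / rows))  # ~1183 tokens per conversation
```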

### Evaluation

Following the paper, our benchmark results are reported using [Evalchemy](https://github.com/mlfoundations/evalchemy/). For more details on the evaluation metrics, please refer to the [paper](https://arxiv.org/abs/2501.18511). We compare against [the baseline model](https://huggingface.co/tanliboy/zephyr-llama-3-8b-sft) used in the paper.

| Metric | Oumi Repro | Baseline |
|:-------|:----------:|:--------:|
| MTBench | 5.2313 | 5.0187 |
| Alpaca Eval (LC) | 1.6157 | 4.1260 |
| BBH | 0.4861 | 0.4845 |
| GPQA | 0.2903 | 0.3204 |
| MATH | 0.0552 | 0.0458 |
| MUSR | 0.4116 | 0.3917 |
| IFEval (Prompt Level, Strict) | 0.1978 | 0.2643 |
| MMLU Pro | 0.3118 | 0.3198 |
| MixEval | 0.5935 | 0.63 |
| Average | 0.321 | 0.333 |