seed: 123
### model
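# Base model is loaded with automatic FlashAttention selection; the KV cache is
# disabled because it only benefits generation, not training.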
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code: true
flash_attn: auto
use_cache: false
### method
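# Prompt tuning trains only the embeddings of the 100 virtual prompt tokens
# prepended to each input; the 8B base model weights stay frozen.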
stage: sft
do_train: true
finetuning_type: prompt-tuning
task_type: CAUSAL_LM
num_virtual_tokens: 100
### dataset
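# CoLA samples are rendered with the llama3 chat template and truncated at
# 2048 tokens; packing is off, so each sample keeps its own sequence.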
dataset: cola
template: llama3
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 16
dataloader_num_workers: 4
packing: false
### output
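# Fractional save_steps means a checkpoint every 5% of total training steps;
# with load_best_model_at_end, the best checkpoint is always retained even
# though save_total_limit is 1.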
output_dir: saves/prompt-tuning/llama-3-8b-instruct/train_cola_1752763927
logging_steps: 5
save_steps: 0.05
overwrite_output_dir: true
save_only_model: false
plot_loss: true
include_num_input_tokens_seen: true
push_to_hub: true
push_to_hub_organization: rbelanec
load_best_model_at_end: true
save_total_limit: 1
### train
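# Training: batch size 8 per device, AdamW with cosine decay from 5e-5 and a
# 10% warmup, 10 epochs in bf16.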
per_device_train_batch_size: 8
learning_rate: 5.0e-5
num_train_epochs: 10.0
weight_decay: 1.0e-5
lr_scheduler_type: cosine
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null
warmup_ratio: 0.1
optim: adamw_torch
report_to:
- wandb
### eval
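# val_size holds out 10% of the data for validation; evaluation runs every 5%
# of total steps, matching save_steps so best-checkpoint selection works.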
per_device_eval_batch_size: 8
eval_strategy: steps
eval_steps: 0.05
val_size: 0.1
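# Launch sketch (assumption: a LLaMA-Factory build with PEFT prompt-tuning
# support): llamafactory-cli train path/to/this_config.yaml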