{
    "model_name_or_path": "princeton-nlp/Sheared-LLaMA-1.3B",
    "dataset_name": "wikitext",
    "dataset_config_name": "wikitext-103-raw-v1",
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "do_train": true,
    "do_eval": true,
    "max_seq_length": 512,
    "mask_token_type": "blank",
    "data_collator_type": "default",
    "mlm_probability": 0.2,
    "overwrite_output_dir": true,
    "output_dir": "output/mntp/Sheared-LLaMA-1.3B",
    "evaluation_strategy": "steps",
    "eval_steps": 100,
    "save_steps": 200,
    "stop_after_n_steps": 1000,
    "lora_r": 16,
    "gradient_checkpointing": true,
    "torch_dtype": "bfloat16",
    "attn_implementation": "flash_attention_2"
}
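
A flat JSON file like this is the convention Hugging Face's HfArgumentParser uses: standard keys (output_dir, do_train, eval_steps, ...) map onto transformers.TrainingArguments, while the remaining keys (mask_token_type, data_collator_type, stop_after_n_steps, lora_r, ...) are picked up by custom dataclasses in the training script. The loader below is a minimal sketch of that pattern, not the project's actual entry point; the ModelArguments and DataTrainingArguments dataclasses and the way the fields are split between them are assumptions for illustration. Only the TrainingArguments fields are real Hugging Face arguments.

# Hypothetical loader sketch: parse the JSON config above into dataclasses
# via HfArgumentParser. Run as: python load_config.py path/to/config.json
import sys
from dataclasses import dataclass
from typing import Optional

from transformers import HfArgumentParser, TrainingArguments


@dataclass
class ModelArguments:
    # Assumed container for the model-side keys in the config.
    model_name_or_path: str = "princeton-nlp/Sheared-LLaMA-1.3B"
    torch_dtype: Optional[str] = None
    attn_implementation: Optional[str] = None
    lora_r: int = 16


@dataclass
class DataTrainingArguments:
    # Assumed container for the data- and masking-related keys.
    dataset_name: Optional[str] = None
    dataset_config_name: Optional[str] = None
    max_seq_length: int = 512
    mlm_probability: float = 0.2
    mask_token_type: str = "blank"
    data_collator_type: str = "default"
    stop_after_n_steps: Optional[int] = None


if __name__ == "__main__":
    # Every key in the JSON is claimed by one of the three dataclasses;
    # keys not defined above fall through to TrainingArguments.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_json_file(json_file=sys.argv[1])
    print(model_args.model_name_or_path, training_args.output_dir)

Splitting the config across small dataclasses keeps the custom masking and early-stop options out of TrainingArguments while letting one JSON file drive the whole run.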