echo 'python run_speech_recognition_seq2seq_streaming.py \
    --model_name_or_path="openai/whisper-small" \
    --dataset_name="mozilla-foundation/common_voice_11_0" \
    --dataset_config_name="bn" \
    --language="bengali" \
    --train_split_name="train+validation" \
    --eval_split_name="test" \
    --model_index_name="Whisper Small Bengali" \
    --output_dir="./" \
    --overwrite_output_dir \
    --max_steps="60000" \
    --per_device_train_batch_size="4" \
    --per_device_eval_batch_size="2" \
    --gradient_accumulation_steps="8" \
    --gradient_checkpointing="False" \
    --evaluation_strategy="steps" \
    --eval_steps="1000" \
    --save_strategy="steps" \
    --save_steps="1000" \
    --save_total_limit="5" \
    --learning_rate="1e-5" \
    --warmup_steps="5000" \
    --logging_steps="25" \
    --weight_decay="0.01" \
    --load_best_model_at_end="True" \
    --metric_for_best_model="wer" \
    --greater_is_better="False" \
    --bf16="True" \
    --tf32="True" \
    --streaming="False" \
    --generation_max_length="225" \
    --length_column_name="input_length" \
    --max_duration_in_seconds="30" \
    --text_column_name="sentence" \
    --freeze_feature_encoder="False" \
    --report_to="tensorboard" \
    --do_train \
    --do_eval \
    --predict_with_generate \
    --do_normalize_eval \
    --use_auth_token \
    --push_to_hub' >> run.sh

#max_steps MAX_STEPS - If > 0: set the total number of training steps to perform. Overrides num_train_epochs. (default: -1)
--max_steps="20000" \

#output_dir OUTPUT_DIR - The output directory where the model predictions and checkpoints will be written. (default: None)
--output_dir="./" \

#overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use this to continue training if output_dir points to a
#checkpoint directory. (default: False)
--overwrite_output_dir \

#weight_decay (float, optional, defaults to 0) — The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the
#AdamW optimizer. Weight decay helps prevent overfitting. Visit: https://towardsdatascience.com/this-thing-called-weight-decay-a7cd4bcfccab
#Rule of thumb: 0.1 - just right, 0.01 - takes more epochs to fit, 10 - never quite fits
--weight_decay="0.01" \

#bf16 (bool, optional, defaults to False) — Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or
#higher NVIDIA architecture or using CPU (no_cuda). This is an experimental API and it may change.
--bf16="True" \

#fp16 (bool, optional, defaults to False) — Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
--fp16="True" \

#tf32 (bool, optional) — Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends on PyTorch's
#version default of torch.backends.cuda.matmul.allow_tf32. This is an experimental API and it may change.
#Details: https://huggingface.co/docs/transformers/perf_train_gpu_one
--tf32="True" \

#gradient_checkpointing (bool, optional, defaults to False) — If True, use gradient checkpointing to save memory at the expense of a slower backward
#pass.
--gradient_checkpointing="False" \

#deepspeed (str or dict, optional) — Use DeepSpeed. This is an experimental feature and its API may evolve in the future. The value is either the
#location of the DeepSpeed json config file (e.g., ds_config.json) or an already loaded json file as a dict.
--deepspeed="ds_config.json" \

#auto_find_batch_size (bool, optional, defaults to False) — Whether to find a batch size that will fit into memory automatically through exponential
#decay, avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (pip install accelerate).
--auto_find_batch_size="True" \
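
#A minimal launch sketch (an assumption, not part of the original command): the generated run.sh can be
#started as below. huggingface-cli login is only needed because --use_auth_token and --push_to_hub are
#set; the log file name is illustrative.
huggingface-cli login                    # authenticate once so --use_auth_token and --push_to_hub can reach the Hub
nohup bash run.sh > train.log 2>&1 &     # keep the 60000-step run alive after the shell closes
tail -f train.log                        # follow the training/evaluation logs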
#lr_scheduler_type (str or SchedulerType, optional, defaults to "linear") — The scheduler type to use.
#Scheduler types: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"
--lr_scheduler_type="linear" \

#torch_compile (bool, optional, defaults to False) — Whether or not to compile the model using PyTorch 2.0 torch.compile (requires a nightly install
#of PyTorch). If set, the backend will default to "inductor" (can be customized with torch_compile_backend) and the mode will default to "default"
#(can be customized with torch_compile_mode).
--torch_compile="True" \

#torch_compile_backend (str, optional) — The backend to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "eager", "aot_eager", "inductor", "nvfuser", "aot_nvfuser", "aot_cudagraphs", "ofi", "fx2trt", "onnxrt" and "ipex".
--torch_compile_backend="inductor" \

#torch_compile_mode (str, optional) — The mode to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes
--torch_compile_mode="default" \

#push_to_hub (bool, optional, defaults to False) — Whether or not to push the model to the Hub every time the model is saved. If this is activated,
#output_dir will begin a git directory synced with the repo (determined by hub_model_id) and the content will be pushed each time a save is triggered
#(depending on your save_strategy). Calling save_model() will also trigger a push. If output_dir exists, it needs to be a local clone of the
#repository to which the Trainer will be pushed.
--push_to_hub="False" \

#resume_from_checkpoint (str, optional) — The path to a folder with a valid checkpoint for your model. This argument is not directly used by Trainer,
#it's intended to be used by your training/evaluation scripts instead. See the example scripts for more details.
--resume_from_checkpoint="directory" \

#load_best_model_at_end (bool, optional, defaults to False) — Whether or not to load the best model found during training at the end of training.
#When set to True, the parameter save_strategy needs to be the same as evaluation_strategy, and in the case it is "steps", save_steps must be a
#round multiple of eval_steps.
--load_best_model_at_end="True" \

#metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models.
#Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and
#load_best_model_at_end=True (to use the evaluation loss). If you set this value, greater_is_better will default to True. Don't forget to set it to
#False if your metric is better when lower.
--metric_for_best_model="wer" \

#greater_is_better (bool, optional) — Use in conjunction with load_best_model_at_end and metric_for_best_model to specify if better models should
#have a greater metric or not. Will default to: True if metric_for_best_model is set to a value that isn't "loss" or "eval_loss"; False if
#metric_for_best_model is not set, or set to "loss" or "eval_loss".
--greater_is_better="False" \

#eval_steps (int, optional) — Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as
#logging_steps if not set.
--eval_steps="1000" \
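
#A hedged sketch (not from the original notes) of resuming an interrupted run from the newest checkpoint
#in output_dir; the checkpoint-* directory names follow the Trainer's save convention and the paths are
#illustrative.
LATEST_CKPT=$(ls -d ./checkpoint-* 2>/dev/null | sort -t- -k2 -n | tail -n1)   # e.g. ./checkpoint-5000
echo "Resuming from: ${LATEST_CKPT}"
#Then pass --resume_from_checkpoint="${LATEST_CKPT}" to the python call in run.sh. In the Hugging Face
#example scripts, leaving --overwrite_output_dir unset typically also makes the run resume from the last
#checkpoint found in output_dir.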
--eval_steps="1000" \ #dataloader_num_workers (int, optional, defaults to 0) — Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be #loaded in the main process. --dataloader_num_workers="1" \ #disable_tqdm (bool, optional) — Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.NotebookTrainingTracker # in Jupyter Notebooks. Will default to True if the logging level is set to warn or lower (default), False otherwise. --disable_tqdm="False" \ #optim (str or training_args.OptimizerNames, optional, defaults to "adamw_hf") — The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, #adamw_anyprecision or adafactor. --optim="adamw_hf" \ See this article for more intuition: https://huggingface.co/docs/transformers/perf_train_gpu_one #cache_dir CACHE_DIR. Where to store the pretrained models downloaded from huggingface.co (default: None) --cache_dir="~/asr_training/models_cache" #max_train_samples MAX_TRAIN_SAMPLES. For debugging purposes or quicker training, truncate the number of training examples to this value if set. #(default: None) --max_train_samples="1000" #max_eval_samples MAX_EVAL_SAMPLES. For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set. #(default: None) --max_eval_samples="100" #train_split_name TRAIN_SPLIT_NAME. The name of the training data set split to use (via the datasets library). Defaults to 'train' (default: train) --train_split_name="train" \ #eval_split_name EVAL_SPLIT_NAME. The name of the training data set split to use (via the datasets library). Defaults to 'train' (default: test) --eval_split_name="valid" \ #do_lower_case [DO_LOWER_CASE]. Whether the target text should be lower cased. (default: False) do_lower_case="False" \ #do_remove_punctuation [DO_REMOVE_PUNCTUATION]. Whether the target text should be striped of punctuation. (default: False) --do_remove_punctuation="False" \ #do_normalize_eval [DO_NORMALIZE_EVAL]. Whether to normalise the references and predictions in the eval WER calculation. (default: True) --do_normalize_eval="True" \ #no_do_normalize_eval. Whether to normalise the references and predictions in the eval WER calculation. (default: False) --no_do_normalize_eval="False" \ |