|
echo 'python run_speech_recognition_seq2seq_streaming.py \
  --model_name_or_path="openai/whisper-small" \
  --dataset_name="mozilla-foundation/common_voice_11_0" \
  --dataset_config_name="bn" \
  --language="bengali" \
  --train_split_name="train+validation" \
  --eval_split_name="test" \
  --model_index_name="Whisper Small Bengali" \
  --output_dir="./" \
  --overwrite_output_dir \
  --max_steps="60000" \
  --per_device_train_batch_size="4" \
  --per_device_eval_batch_size="2" \
  --gradient_accumulation_steps="8" \
  --gradient_checkpointing="False" \
  --evaluation_strategy="steps" \
  --eval_steps="1000" \
  --save_strategy="steps" \
  --save_steps="1000" \
  --save_total_limit="5" \
  --learning_rate="1e-5" \
  --warmup_steps="5000" \
  --logging_steps="25" \
  --weight_decay="0.01" \
  --load_best_model_at_end="True" \
  --metric_for_best_model="wer" \
  --greater_is_better="False" \
  --bf16="True" \
  --tf32="True" \
  --streaming="False" \
  --generation_max_length="225" \
  --length_column_name="input_length" \
  --max_duration_in_seconds="30" \
  --text_column_name="sentence" \
  --freeze_feature_encoder="False" \
  --report_to="tensorboard" \
  --do_train \
  --do_eval \
  --predict_with_generate \
  --do_normalize_eval \
  --use_auth_token \
  --push_to_hub' >> run.sh
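
# Once run.sh has been assembled, it can be launched directly. A minimal
# sketch, assuming a single-GPU machine and that a Hugging Face token is
# already cached locally (needed for --use_auth_token and --push_to_hub):

bash run.sh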
|
|
|
#max_steps MAX_STEPS - If > 0: set total number of training steps to perform. Override num_train_epochs. (default: -1) |
|
|
|
--max_steps="20000" \ |
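
# Worked example (hypothetical single-GPU run with the flags from run.sh):
# max_steps counts optimizer updates, and each update consumes
# per_device_train_batch_size * gradient_accumulation_steps * num_gpus examples.

echo $((4 * 8 * 1))          # 32 examples per optimizer step
echo $((60000 * 4 * 8 * 1))  # ~1.92M examples consumed over a 60000-step run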
|
|
|
|
|
#output_dir OUTPUT_DIR - The output directory where the model predictions and checkpoints will be written. (default: None) |
|
|
|
--output_dir="./" \ |
|
|
|
|
|
#overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use this to continue training if output_dir points to a |
|
#checkpoint directory. (default: False) |
|
|
|
--overwrite_output_dir \ |
|
|
|
|
|
#weight_decay (float, optional, defaults to 0) — The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in AdamW |
|
#optimizer. Weight decay helps prevent overfitting; for more intuition see: https://towardsdatascience.com/this-thing-called-weight-decay-a7cd4bcfccab

# Rough rule of thumb: 0.1 fits about right, 0.01 takes more epochs to fit, 10 never quite fits.
|
|
|
--weight_decay="0.01" \ |
|
|
|
|
|
#bf16 (bool, optional, defaults to False) — Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher |
|
#NVIDIA architecture or using CPU (no_cuda). This is an experimental API and it may change.
|
|
|
--bf16="True" \ |
|
|
|
|
|
#fp16 (bool, optional, defaults to False) — Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. |
|
|
|
--fp16="True" \ |
|
|
|
|
|
#tf32 (bool, optional) — Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends on PyTorch’s |
|
#version default of torch.backends.cuda.matmul.allow_tf32. This is an experimental API and it may change. |
|
#details: https://huggingface.co/docs/transformers/perf_train_gpu_one |
|
|
|
--tf32="True" \ |
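
# Optional sanity check before enabling the bf16/tf32 flags above: both need an
# Ampere-or-newer GPU (compute capability 8.0+). On reasonably recent NVIDIA
# drivers, nvidia-smi can report the compute capability directly:

nvidia-smi --query-gpu=name,compute_cap --format=csv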
|
|
|
|
|
#gradient_checkpointing (bool, optional, defaults to False) — If True, use gradient checkpointing to save memory at the expense of slower backward |
|
#pass. |
|
|
|
--gradient_checkpointing="False" \ |
|
|
|
|
|
|
|
#deepspeed (str or dict, optional) — Use Deepspeed. This is an experimental feature and its API may evolve in the future. The value is either the |
|
#location of DeepSpeed json config file (e.g., ds_config.json) or an already loaded json file as a dict.
|
|
|
--deepspeed="ds_config.json" \ |
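
# A minimal ds_config.json sketch to pair with the flag above (assumed values:
# "auto" lets DeepSpeed inherit batch size, accumulation, clipping and bf16
# settings from the Trainer arguments, and ZeRO stage 2 shards optimizer state):

cat > ds_config.json <<'EOF'
{
  "train_micro_batch_size_per_gpu": "auto",
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "bf16": { "enabled": "auto" },
  "zero_optimization": { "stage": 2 }
}
EOF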
|
|
|
|
|
#auto_find_batch_size (bool, optional, defaults to False) — Whether to find a batch size that will fit into memory automatically through exponential |
|
#decay, avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (pip install accelerate) |
|
|
|
--auto_find_batch_size="True" \ |
|
|
|
|
|
#lr_scheduler_type (str or SchedulerType, optional, defaults to "linear") — The scheduler type to use. |
|
#Scheduler types: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup" |
|
|
|
--lr_scheduler_type="linear" \ |
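
# Rough sketch of the "linear" schedule with the values from run.sh
# (learning_rate=1e-5, warmup_steps=5000, max_steps=60000): the LR ramps up
# linearly over the warmup and then decays linearly to zero. Sampled at a few
# steps with awk:

awk 'BEGIN {
  lr = 1e-5; warmup = 5000; max_steps = 60000;
  n = split("1000 5000 30000 60000", s, " ");
  for (i = 1; i <= n; i++) {
    t = s[i];
    cur = (t < warmup) ? lr * t / warmup : lr * (max_steps - t) / (max_steps - warmup);
    printf "step %6d -> lr %.2e\n", t, cur;
  }
}'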
|
|
|
|
|
#torch_compile (bool, optional, defaults to False) — Whether or not to compile the model using PyTorch 2.0 torch.compile (requires a nightly install of
|
#PyTorch). If set, the backend will default to "inductor" (can be customized with torch_compile_backend) and the mode will default to "default" (can be |
|
#customized with torch_compile_mode). |
|
|
|
--torch_compile="True" \ |
|
|
|
|
|
#torch_compile_backend (str, optional) — The backend to use in torch.compile. If set to any value, torch_compile will be set to True. |
|
#Possible choices are "eager", "aot_eager", "inductor", "nvfuser", "aot_nvfuser", "aot_cudagraphs", "ofi", "fx2trt", "onnxrt" and "ipex". |
|
|
|
--torch_compile_backend="inductor" \ |
|
|
|
#torch_compile_mode (str, optional) — The mode to use in torch.compile. If set to any value, torch_compile will be set to True. |
|
#Possible choices are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes |
|
|
|
--torch_compile_mode="default" \ |
|
|
|
|
|
#push_to_hub (bool, optional, defaults to False) — Whether or not to push the model to the Hub every time the model is saved. If this is activated, |
|
#output_dir will begin a git directory synced with the repo (determined by hub_model_id) and the content will be pushed each time a save is triggered |
|
#(depending on your save_strategy). Calling save_model() will also trigger a push. If output_dir exists, it needs to be a local clone of the repository |
|
#to which the Trainer will be pushed. |
|
|
|
--push_to_hub="False" \ |
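
# Both --push_to_hub and --use_auth_token need a Hugging Face Hub token to be
# available locally; logging in once beforehand caches it:

huggingface-cli login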
|
|
|
|
|
#resume_from_checkpoint (str, optional) — The path to a folder with a valid checkpoint for your model. This argument is not directly used by Trainer, |
|
#it’s intended to be used by your training/evaluation scripts instead. See the example scripts for more details. |
|
|
|
--resume_from_checkpoint="directory" \ |
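
# A small helper sketch for resuming: Trainer writes checkpoints as
# checkpoint-<step> directories under output_dir ("./" in run.sh), so the most
# recent one can be located and passed to --resume_from_checkpoint
# (assumes at least one checkpoint has already been saved):

latest=$(ls -d ./checkpoint-* | sort -V | tail -n 1)
echo "resume with: --resume_from_checkpoint=\"${latest}\""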
|
|
|
|
|
#load_best_model_at_end (bool, optional, defaults to False) — Whether or not to load the best model found during training at the end of training. |
|
#When set to True, the parameters save_strategy needs to be the same as evaluation_strategy, and in the case it is “steps”, save_steps must be a |
|
#round multiple of eval_steps. |
|
|
|
--load_best_model_at_end="True" \ |
|
|
|
|
|
#metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. |
|
#Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and |
|
#load_best_model_at_end=True (to use the evaluation loss). If you set this value, greater_is_better will default to True. Don’t forget to set it to |
|
#False if your metric is better when lower. |
|
|
|
--metric_for_best_model="wer" \ |
|
|
|
|
|
#greater_is_better (bool, optional) — Use in conjunction with load_best_model_at_end and metric_for_best_model to specify if better models should have |
|
#a greater metric or not. Will default to: True if metric_for_best_model is set to a value that isn’t "loss" or "eval_loss". False if |
|
#metric_for_best_model is not set, or set to "loss" or "eval_loss". |
|
|
|
--greater_is_better="False" |
|
|
|
|
|
#eval_steps (int, optional) — Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as |
|
#logging_steps if not set. |
|
|
|
--eval_steps="1000" \ |
|
|
|
|
|
#dataloader_num_workers (int, optional, defaults to 0) — Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be |
|
#loaded in the main process. |
|
|
|
--dataloader_num_workers="1" \ |
|
|
|
|
|
#disable_tqdm (bool, optional) — Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.NotebookTrainingTracker |
|
# in Jupyter Notebooks. Will default to True if the logging level is set to warn or lower (default), False otherwise. |
|
|
|
--disable_tqdm="False" \ |
|
|
|
#optim (str or training_args.OptimizerNames, optional, defaults to "adamw_hf") — The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, |
|
#adamw_anyprecision or adafactor. |
|
|
|
--optim="adamw_hf" \ |
|
|
|
|
|
See this article for more intuition: |
|
|
|
https://huggingface.co/docs/transformers/perf_train_gpu_one |
|
|
|
|
|
#cache_dir CACHE_DIR. Where to store the pretrained models downloaded from huggingface.co (default: None) |
|
|
|
--cache_dir="${HOME}/asr_training/models_cache"   # note: a quoted "~" is not expanded by the shell, so use ${HOME}
|
|
|
|
|
#max_train_samples MAX_TRAIN_SAMPLES. For debugging purposes or quicker training, truncate the number of training examples to this value if set. |
|
#(default: None) |
|
|
|
--max_train_samples="1000" |
|
|
|
|
|
#max_eval_samples MAX_EVAL_SAMPLES. For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set. |
|
#(default: None) |
|
|
|
--max_eval_samples="100" |
|
|
|
|
|
#train_split_name TRAIN_SPLIT_NAME. The name of the training data set split to use (via the datasets library). Defaults to 'train' (default: train) |
|
|
|
--train_split_name="train" \ |
|
|
|
|
|
#eval_split_name EVAL_SPLIT_NAME. The name of the evaluation data set split to use (via the datasets library). Defaults to 'test' (default: test)
|
|
|
--eval_split_name="valid" \ |
|
|
|
|
|
#do_lower_case [DO_LOWER_CASE]. Whether the target text should be lower cased. (default: False) |
|
|
|
--do_lower_case="False" \
|
|
|
#do_remove_punctuation [DO_REMOVE_PUNCTUATION]. Whether the target text should be stripped of punctuation. (default: False)
|
|
|
--do_remove_punctuation="False" \ |
|
|
|
#do_normalize_eval [DO_NORMALIZE_EVAL]. Whether to normalise the references and predictions in the eval WER calculation. (default: True) |
|
|
|
--do_normalize_eval="True" \ |
|
|
|
#no_do_normalize_eval. Disables normalisation of the references and predictions in the eval WER calculation; this is the
#negated switch form of do_normalize_eval and takes no value. (default: False)



--no_do_normalize_eval \
|
|
|
|
|
|
|
|
|
|
|
|
|
|