echo 'python run_speech_recognition_seq2seq_streaming.py \
--model_name_or_path="openai/whisper-small" \
--dataset_name="mozilla-foundation/common_voice_11_0" \
--dataset_config_name="bn" \
--language="bengali" \
--train_split_name="train+validation" \
--eval_split_name="test" \
--model_index_name="Whisper Small Bengali" \
--output_dir="./" \
--overwrite_output_dir \
--max_steps="60000" \
--per_device_train_batch_size="4" \
--per_device_eval_batch_size="2" \
--gradient_accumulation_steps="8" \
--gradient_checkpointing="False" \
--evaluation_strategy="steps" \
--eval_steps="1000" \
--save_strategy="steps" \
--save_steps="1000" \
--save_total_limit="5" \
--learning_rate="1e-5" \
--warmup_steps="5000" \
--logging_steps="25" \
--weight_decay="0.01" \
--load_best_model_at_end="True" \
--metric_for_best_model="wer" \
--greater_is_better="False" \
--bf16="True" \
--tf32="True" \
--streaming="False" \
--generation_max_length="225" \
--length_column_name="input_length" \
--max_duration_in_seconds="30" \
--text_column_name="sentence" \
--freeze_feature_encoder="False" \
--report_to="tensorboard" \
--do_train \
--do_eval \
--predict_with_generate \
--do_normalize_eval \
--use_auth_token \
--push_to_hub' >> run.sh
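
#The block above appends the full training command to run.sh. A minimal sketch of launching it (assumes run.sh sits in the
#current directory and the active environment has transformers and its dependencies installed):

	bash run.sh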

#max_steps MAX_STEPS - If > 0: set total number of training steps to perform. Overrides num_train_epochs. (default: -1)

	--max_steps="20000" \


#output_dir OUTPUT_DIR - The output directory where the model predictions and checkpoints will be written. (default: None)

	--output_dir="./" \


#overwrite_output_dir [OVERWRITE_OUTPUT_DIR] - Overwrite the content of the output directory. Use this to continue training if output_dir points to a
#checkpoint directory. (default: False)

	--overwrite_output_dir \


#weight_decay (float, optional, defaults to 0) — The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the AdamW
#optimizer. Weight decay helps prevent overfitting. Visit: https://towardsdatascience.com/this-thing-called-weight-decay-a7cd4bcfccab
#Rough intuition: 0.1 fits just right, 0.01 takes more epochs to fit, 10 never quite fits.

	--weight_decay="0.01" \


#bf16 (bool, optional, defaults to False) — Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
#NVIDIA architecture or using CPU (no_cuda). This is an experimental API and it may change.

	--bf16="True" \
	

#fp16 (bool, optional, defaults to False) — Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.

	--fp16="True" \
	

#tf32 (bool, optional) — Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends on PyTorch’s 
#version default of torch.backends.cuda.matmul.allow_tf32. This is an experimental API and it may change.
#details: https://huggingface.co/docs/transformers/perf_train_gpu_one

	--tf32="True" \
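
#Before enabling --bf16/--tf32 it can help to confirm the GPU actually supports bf16 (Ampere or newer); a quick check,
#assuming PyTorch with CUDA is installed:

	python -c "import torch; print(torch.cuda.is_bf16_supported())"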
	

#gradient_checkpointing (bool, optional, defaults to False) — If True, use gradient checkpointing to save memory at the expense of slower backward
#pass.

	--gradient_checkpointing="False" \
	


#deepspeed (str or dict, optional) — Use DeepSpeed. This is an experimental feature and its API may evolve in the future. The value is either the
#location of a DeepSpeed json config file (e.g., ds_config.json) or an already loaded json file as a dict.

	--deepspeed="ds_config.json" \
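
#A minimal sketch of what ds_config.json could contain (hypothetical ZeRO stage 2 config; "auto" lets the transformers
#integration fill values in from the command-line arguments; adjust to your setup):

	cat > ds_config.json <<-'EOF'
	{
	    "zero_optimization": { "stage": 2 },
	    "bf16": { "enabled": true },
	    "train_micro_batch_size_per_gpu": "auto",
	    "gradient_accumulation_steps": "auto"
	}
	EOF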
	
	
#auto_find_batch_size (bool, optional, defaults to False) — Whether to find a batch size that will fit into memory automatically through exponential
#decay, avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (pip install accelerate)

	--auto_find_batch_size="True" \
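
#As noted above, this option needs the accelerate package (assumes pip in the active environment):

	pip install accelerate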


#lr_scheduler_type (str or SchedulerType, optional, defaults to "linear") — The scheduler type to use.
#Scheduler types: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"

	--lr_scheduler_type="linear" \
	

#torch_compile (bool, optional, defaults to False) — Whether or not to compile the model using PyTorch 2.0 torch.compile (requires a nightly install of
#PyTorch). If set, the backend will default to "inductor" (can be customized with torch_compile_backend) and the mode will default to "default" (can be
#customized with torch_compile_mode).

	--torch_compile="True" \
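
#torch.compile needs PyTorch 2.0 or newer; a quick version check (assumes PyTorch is already installed):

	python -c "import torch; print(torch.__version__)"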


#torch_compile_backend (str, optional) — The backend to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "eager", "aot_eager", "inductor", "nvfuser", "aot_nvfuser", "aot_cudagraphs", "ofi", "fx2trt", "onnxrt" and "ipex".

	--torch_compile_backend="inductor" \

#torch_compile_mode (str, optional) — The mode to use in torch.compile. If set to any value, torch_compile will be set to True.
#Possible choices are "default", "reduce-overhead" and "max-autotune". See: https://pytorch.org/get-started/pytorch-2.0/#modes

	--torch_compile_mode="default" \

	
#push_to_hub (bool, optional, defaults to False) — Whether or not to push the model to the Hub every time the model is saved. If this is activated, 
#output_dir will become a git directory synced with the repo (determined by hub_model_id) and the content will be pushed each time a save is triggered
#(depending on your save_strategy). Calling save_model() will also trigger a push. If output_dir exists, it needs to be a local clone of the repository
#to which the Trainer will be pushed.
	
	--push_to_hub="False" \
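
#Pushing to the Hub (and --use_auth_token in the command above) needs a stored access token; one way to provide it,
#assuming the huggingface_hub CLI is installed:

	huggingface-cli login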
	

#resume_from_checkpoint (str, optional) — The path to a folder with a valid checkpoint for your model. This argument is not directly used by Trainer, 
#it’s intended to be used by your training/evaluation scripts instead. See the example scripts for more details.

	--resume_from_checkpoint="directory" \
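
#For example, to continue training from the checkpoint saved at step 5000 (hypothetical path; the checkpoint-<step>
#naming matches what Trainer writes under output_dir):

	--resume_from_checkpoint="./checkpoint-5000" \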
	
	
#load_best_model_at_end (bool, optional, defaults to False) — Whether or not to load the best model found during training at the end of training.
#When set to True, the parameter save_strategy needs to be the same as evaluation_strategy, and in the case it is “steps”, save_steps must be a
#round multiple of eval_steps.

	--load_best_model_at_end="True" \
	

#metric_for_best_model (str, optional) — Use in conjunction with load_best_model_at_end to specify the metric to use to compare two different models. 
#Must be the name of a metric returned by the evaluation with or without the prefix "eval_". Will default to "loss" if unspecified and 
#load_best_model_at_end=True (to use the evaluation loss). If you set this value, greater_is_better will default to True. Don’t forget to set it to 
#False if your metric is better when lower.

	--metric_for_best_model="wer" \
	

#greater_is_better (bool, optional) — Use in conjunction with load_best_model_at_end and metric_for_best_model to specify if better models should have
#a greater metric or not. Will default to: True if metric_for_best_model is set to a value that isn’t "loss" or "eval_loss". False if 
#metric_for_best_model is not set, or set to "loss" or "eval_loss".

	--greater_is_better="False"
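
#Putting the three options above together with a consistent schedule (save_strategy matches evaluation_strategy and
#save_steps is a round multiple of eval_steps, as load_best_model_at_end requires; values taken from the command above):

	--evaluation_strategy="steps" \
	--eval_steps="1000" \
	--save_strategy="steps" \
	--save_steps="1000" \
	--load_best_model_at_end="True" \
	--metric_for_best_model="wer" \
	--greater_is_better="False" \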


#eval_steps (int, optional) — Number of update steps between two evaluations if evaluation_strategy="steps". Will default to the same value as 
#logging_steps if not set.

	--eval_steps="1000" \
	

#dataloader_num_workers (int, optional, defaults to 0) — Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be 
#loaded in the main process.

	--dataloader_num_workers="1" \
	

#disable_tqdm (bool, optional) — Whether or not to disable the tqdm progress bars and table of metrics produced by ~notebook.NotebookTrainingTracker 
# in Jupyter Notebooks. Will default to True if the logging level is set to warn or lower (default), False otherwise.

	--disable_tqdm="False" \

#optim (str or training_args.OptimizerNames, optional, defaults to "adamw_hf") — The optimizer to use: adamw_hf, adamw_torch, adamw_apex_fused, 
#adamw_anyprecision or adafactor.

	--optim="adamw_hf" \


See this article for more intuition:

		https://huggingface.co/docs/transformers/perf_train_gpu_one
		

#cache_dir CACHE_DIR. Where to store the pretrained models downloaded from huggingface.co (default: None)

	--cache_dir="$HOME/asr_training/models_cache"


#max_train_samples MAX_TRAIN_SAMPLES. For debugging purposes or quicker training, truncate the number of training examples to this value if set.
#(default: None)

	--max_train_samples="1000"
	
	
#max_eval_samples MAX_EVAL_SAMPLES. For debugging purposes or quicker training, truncate the number of evaluation examples to this value if set.
#(default: None)

	--max_eval_samples="100"
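
#For a quick smoke test of the whole pipeline, the two truncation flags above can be combined with a small step budget
#(values are only illustrative):

	--max_steps="100" \
	--max_train_samples="1000" \
	--max_eval_samples="100" \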
	

#train_split_name TRAIN_SPLIT_NAME. The name of the training data set split to use (via the datasets library). Defaults to 'train' (default: train)

	--train_split_name="train" \
	
	
#eval_split_name EVAL_SPLIT_NAME. The name of the evaluation data set split to use (via the datasets library). Defaults to 'test' (default: test)

	--eval_split_name="valid" \
	

#do_lower_case [DO_LOWER_CASE]. Whether the target text should be lower cased. (default: False)

	--do_lower_case="False" \

#do_remove_punctuation [DO_REMOVE_PUNCTUATION]. Whether the target text should be stripped of punctuation. (default: False)

	--do_remove_punctuation="False" \
	
#do_normalize_eval [DO_NORMALIZE_EVAL]. Whether to normalise the references and predictions in the eval WER calculation. (default: True)

	--do_normalize_eval="True" \
	
#no_do_normalize_eval. Disables normalisation of the references and predictions in the eval WER calculation; the flag takes no value. (default: False)

	--no_do_normalize_eval \