# Configuration for training only on Khmer (FLEURS km_kh) data
# Fine-tuning Whisper on Khmer using the Google FLEURS dataset

# Model Configuration
model:
  checkpoint: "openai/whisper-large-v3"
  max_target_length: 448
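  # 448 matches the Whisper decoder's maximum target length (max_target_positions),
  # so labels are truncated to what the model can actually attend to.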
# Output Configuration
output:
  output_dir: "./whisper-fleurs-km_kh-small"

# Environment Configuration
environment:
  max_cpu_cores: 20
  test_cpu_cores: 20
  omp_num_threads: "20"
  mkl_num_threads: "20"
  openblas_num_threads: "20"
  veclib_maximum_threads: "20"
  numexpr_num_threads: "20"
  tokenizers_parallelism: "false"
  transformers_no_tf: "1"
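  # These limits are presumably exported by the training script as the correspondingly
  # named environment variables (OMP_NUM_THREADS, MKL_NUM_THREADS, OPENBLAS_NUM_THREADS,
  # VECLIB_MAXIMUM_THREADS, NUMEXPR_NUM_THREADS, TOKENIZERS_PARALLELISM, ...) before
  # NumPy/PyTorch are imported, capping BLAS and tokenizer thread usage at 20 cores.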
# Audio Processing Configuration
audio:
  sampling_rate: 16000
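  # Whisper's feature extractor operates on 16 kHz audio, and FLEURS is distributed
  # at 16 kHz, so the audio column should not need resampling.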
# Language Configurations - Khmer only
languages:
  khmer:
    whisper_language: "khmer"
    fleurs_language: "km_kh"
    text_key: "transcription"
    train_subset_ratio: 0.25  # Use only 25% of training data for faster training/experimentation
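    # whisper_language presumably selects the decoder language token for Khmer (<|km|>)
    # via the Whisper tokenizer/processor; fleurs_language is the FLEURS config name
    # passed to load_dataset, and text_key names the FLEURS column holding the transcript.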
# Dataset Configurations - Khmer FLEURS
datasets:
  khmer:
    source: "google/fleurs"
    language_code: "km_kh"
    splits:
      train: "train"
      validation: "validation"
      test: "test"
    trust_remote_code: true
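    # trust_remote_code lets datasets.load_dataset execute the loading script that the
    # google/fleurs repository ships on the Hugging Face Hub.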
# Training Configuration
training:
  # Basic training parameters
  learning_rate: 1.0e-5
  warmup_steps: 100
  max_steps: 800
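  # With the Hugging Face Trainer, a positive max_steps overrides any epoch-based
  # duration, so training stops after exactly 800 optimizer steps.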
  # Batch size and accumulation
  single_gpu:
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    gradient_accumulation_steps: 1
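    # Effective batch size on one GPU: 16 per device x 1 accumulation step = 16 examples
    # per optimizer update.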
  # Optimization settings
  gradient_checkpointing: true
  fp16: true
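  # gradient_checkpointing trades extra forward-pass compute for lower activation memory,
  # which is typically needed to fit whisper-large-v3 at this batch size on a single GPU;
  # fp16 reduces memory and speeds up training further.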
  # Evaluation settings
  eval_strategy: "steps"
  eval_steps: 100
  predict_with_generate: true
  generation_max_length: 225
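  # predict_with_generate makes the Seq2SeqTrainer decode with generate() during
  # evaluation so CER is computed on actual transcripts; generation_max_length (225)
  # caps the generated sequence and stays well under max_target_length (448).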
  # Saving and logging
  save_steps: 100
  logging_steps: 10
  save_total_limit: 3

  # Model selection
  load_best_model_at_end: true
  metric_for_best_model: "cer"  # CER rather than WER: Khmer script does not separate words with spaces
  greater_is_better: false
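  # With load_best_model_at_end, save_steps must be a round multiple of eval_steps
  # (both 100 here); the lowest-CER checkpoint is restored when training finishes.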
  # Reporting
  report_to:
    - "tensorboard"

  # Hub settings
  push_to_hub: false

  # Multi-GPU specific settings
  dataloader_drop_last: true
  ddp_find_unused_parameters: false
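  # ddp_find_unused_parameters only matters when the script is launched with
  # DistributedDataParallel (e.g. via torchrun); dataloader_drop_last drops the final
  # incomplete batch so every device sees identically sized batches.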
# Data Processing Configuration
data_processing:
  # Random seed for reproducibility
  seed: 42
  # Columns to remove during standardization
  columns_to_remove:
    - "id"
    - "num_samples"
    - "path"
    - "speaker_id"
    - "chapter_id"
    - "segment_id"