---
# Configuration for training only on Khmer (FLEURS km_kh) data.
# Fine-tunes Whisper on the Khmer language using the Google FLEURS dataset.
#
# NOTE(review): this file was restored from a copy whose line breaks and
# indentation had been collapsed. Nesting was reconstructed from key order and
# the section comments; confirm the placement of `train_subset_ratio`,
# `trust_remote_code`, and the keys that follow `training.single_gpu` against
# the code that loads this file.

# Model Configuration
model:
  checkpoint: "openai/whisper-large-v3"
  max_target_length: 448

# Output Configuration
output:
  # NOTE(review): the directory name ends in "-small" while the checkpoint
  # above is whisper-large-v3. Value kept as-is; rename if it is a leftover.
  output_dir: "./whisper-fleurs-km_kh-small"

# Environment Configuration
environment:
  max_cpu_cores: 20
  test_cpu_cores: 20
  # Thread counts and flags below are quoted so they stay strings.
  omp_num_threads: "20"
  mkl_num_threads: "20"
  openblas_num_threads: "20"
  veclib_maximum_threads: "20"
  numexpr_num_threads: "20"
  tokenizers_parallelism: "false"
  transformers_no_tf: "1"

# Audio Processing Configuration
audio:
  sampling_rate: 16000

# Language Configurations - Khmer only
languages:
  khmer:
    whisper_language: "khmer"
    fleurs_language: "km_kh"
    text_key: "transcription"
    # Use only 25% of training data for faster training/experimentation.
    train_subset_ratio: 0.25

# Dataset Configurations - Khmer FLEURS
datasets:
  khmer:
    source: "google/fleurs"
    language_code: "km_kh"
    splits:
      train: "train"
      validation: "validation"
      test: "test"
    trust_remote_code: true

# Training Configuration
training:
  # Basic training parameters
  learning_rate: 1.0e-5
  warmup_steps: 100
  max_steps: 800

  # Batch size and accumulation
  single_gpu:
    per_device_train_batch_size: 16
    per_device_eval_batch_size: 16
    gradient_accumulation_steps: 1

  # Optimization settings
  gradient_checkpointing: true
  fp16: true

  # Evaluation settings
  eval_strategy: "steps"
  eval_steps: 100
  predict_with_generate: true
  generation_max_length: 225

  # Saving and logging
  save_steps: 100
  logging_steps: 10
  save_total_limit: 3

  # Model selection
  load_best_model_at_end: true
  # Using CER for Khmer (character-based language); lower is better.
  metric_for_best_model: "cer"
  greater_is_better: false

  # Reporting
  report_to:
    - "tensorboard"

  # Hub settings
  push_to_hub: false

  # Multi-GPU specific settings
  dataloader_drop_last: true
  ddp_find_unused_parameters: false

# Data Processing Configuration
data_processing:
  # Random seed for reproducibility
  seed: 42

  # Columns to remove during standardization
  columns_to_remove:
    - "id"
    - "num_samples"
    - "path"
    - "speaker_id"
    - "chapter_id"
    - "segment_id"