#!/bin/bash
# Single-speaker fine-tuning script for VibeVoice-1.5B on CLEANED Elise dataset
# No voice prompts - pure text-to-speech for Elise voice only
# WITH PROPER EOS TOKEN to fix repetition/looping issue

# Fail fast: abort on any command failure, on unset variables, and on
# pipeline failures — otherwise a crashed training run would still fall
# through to the "complete!" message below and exit 0.
set -euo pipefail

echo "Single-speaker fine-tuning on cleaned dataset..."
echo "Using 544 clean samples (no cutoffs)"
echo "NO voice prompts - training pure Elise TTS model"

# LoRA fine-tune over all attention + MLP projections.
# Effective batch size = 4 (per-device) x 8 (grad accum) = 32.
# --voice_prompt_drop_rate 1.0 drops voice prompts 100% of the time,
#   training a pure text-conditioned Elise voice.
# --gradient_clipping is a bare enable flag; the clip value itself is
#   set via --max_grad_norm 0.8.
python -m src.finetune_vibevoice_lora \
  --model_name_or_path . \
  --train_jsonl elise_cleaned/train_split.jsonl \
  --validation_jsonl elise_cleaned/val.jsonl \
  --text_column_name text \
  --audio_column_name audio \
  --output_dir finetune_elise_single_speaker \
  --per_device_train_batch_size 4 \
  --gradient_accumulation_steps 8 \
  --learning_rate 2.5e-5 \
  --num_train_epochs 4 \
  --logging_steps 10 \
  --save_steps 100 \
  --eval_steps 100 \
  --report_to none \
  --remove_unused_columns False \
  --bf16 True \
  --do_train \
  --do_eval \
  --gradient_clipping \
  --gradient_checkpointing False \
  --ddpm_batch_mul 2 \
  --diffusion_loss_weight 1.4 \
  --train_diffusion_head True \
  --ce_loss_weight 0.04 \
  --voice_prompt_drop_rate 1.0 \
  --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.03 \
  --max_grad_norm 0.8 \
  --max_length 4096

echo "Single-speaker fine-tuning complete!"
echo "Model will now generate Elise voice from text only - no voice prompts needed!"