# Vibevoice_1_5_lora / finetune_elise_single_speaker.sh
# DevParker's picture
# Upload 8 files
# 86e8346 verified
#!/bin/bash
# Single-speaker fine-tuning script for VibeVoice-1.5B on the CLEANED Elise dataset.
# No voice prompts - pure text-to-speech for the Elise voice only.
# WITH PROPER EOS TOKEN to fix repetition/looping issue.
# NOTE(review): no flag in this script configures an EOS token explicitly;
# presumably that is handled by the dataset or the trainer module - confirm.
#
# Usage:
#   ./finetune_elise_single_speaker.sh [extra trainer args...]
#
#   Run it from the directory that holds the model files (the model path is
#   "."), the `src` package and the `elise_cleaned/` dataset directory: every
#   path below is relative to the current working directory.
#   Extra arguments are appended verbatim after the defaults listed below.
#
# Exit status:
#   0  training finished successfully
#   1  a required dataset file is missing
#   N  any other non-zero status is the trainer's own exit status

set -euo pipefail

readonly TRAIN_JSONL="elise_cleaned/train_split.jsonl"
readonly VALIDATION_JSONL="elise_cleaned/val.jsonl"
readonly OUTPUT_DIR="finetune_elise_single_speaker"

# Fail fast with a clear message instead of letting the trainer start up and
# then crash on a missing file (typically caused by a wrong working directory).
for dataset_file in "$TRAIN_JSONL" "$VALIDATION_JSONL"; do
  if [[ ! -f "$dataset_file" ]]; then
    printf 'Error: dataset file not found: %s (current directory: %s)\n' \
      "$dataset_file" "$PWD" >&2
    exit 1
  fi
done

echo "Single-speaker fine-tuning on cleaned dataset..."
echo "Using 544 clean samples (no cutoffs)"
echo "NO voice prompts - training pure Elise TTS model"

# Trainer arguments are kept in an array so each group can be commented and
# no fragile backslash continuations are needed.
train_args=(
  # Model and data locations.
  --model_name_or_path .
  --train_jsonl "$TRAIN_JSONL"
  --validation_jsonl "$VALIDATION_JSONL"
  --text_column_name text
  --audio_column_name audio
  --output_dir "$OUTPUT_DIR"

  # Batch size and schedule length. 4 x 8 = 32 samples per optimizer step per
  # device, assuming the usual gradient-accumulation semantics - TODO confirm.
  --per_device_train_batch_size 4
  --gradient_accumulation_steps 8
  --learning_rate 2.5e-5
  --num_train_epochs 4

  # Logging, checkpointing and evaluation cadence (in steps).
  --logging_steps 10
  --save_steps 100
  --eval_steps 100
  --report_to none

  # Runtime switches.
  --remove_unused_columns False
  --bf16 True
  --do_train
  --do_eval
  --gradient_clipping
  --gradient_checkpointing False

  # Loss configuration.
  --ddpm_batch_mul 2
  --diffusion_loss_weight 1.4
  --train_diffusion_head True
  --ce_loss_weight 0.04

  # Drop rate 1.0: matches the stated intent above of training without voice
  # prompts (exact semantics live in the trainer module - verify there).
  --voice_prompt_drop_rate 1.0

  # LoRA targets and optimizer schedule.
  --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
  --lr_scheduler_type cosine
  --warmup_ratio 0.03
  --max_grad_norm 0.8
  --max_length 4096
)

# Capture the trainer's status explicitly so a failure is reported and
# propagated; previously the success messages were printed unconditionally.
train_status=0
python -m src.finetune_vibevoice_lora "${train_args[@]}" "$@" || train_status=$?

if (( train_status != 0 )); then
  printf 'Error: fine-tuning failed with exit status %d\n' "$train_status" >&2
  exit "$train_status"
fi

echo "Single-speaker fine-tuning complete!"
echo "Model will now generate Elise voice from text only - no voice prompts needed!"