#!/bin/bash
#
# Launch single-speaker LoRA fine-tuning of VibeVoice on the cleaned
# Elise dataset (544 samples, cutoff-free). Voice prompts are dropped
# 100% of the time (--voice_prompt_drop_rate 1.0) so the resulting model
# generates the Elise voice from text alone.
#
# Expects to be run from the repo root: the model path is "." and the
# dataset lives under elise_cleaned/.

# Fail fast: abort on any error, unset variable, or mid-pipeline failure,
# so the "complete!" messages below are only printed on a successful run.
set -euo pipefail

echo "Single-speaker fine-tuning on cleaned dataset..."
echo "Using 544 clean samples (no cutoffs)"
echo "NO voice prompts - training pure Elise TTS model"

python -m src.finetune_vibevoice_lora \
  --model_name_or_path . \
  --train_jsonl elise_cleaned/train_split.jsonl \
  --validation_jsonl elise_cleaned/val.jsonl \
  --text_column_name text \
  --audio_column_name audio \
  --output_dir finetune_elise_single_speaker \
  --per_device_train_batch_size 4 \
  --gradient_accumulation_steps 8 \
  --learning_rate 2.5e-5 \
  --num_train_epochs 4 \
  --logging_steps 10 \
  --save_steps 100 \
  --eval_steps 100 \
  --report_to none \
  --remove_unused_columns False \
  --bf16 True \
  --do_train \
  --do_eval \
  --gradient_clipping \
  --gradient_checkpointing False \
  --ddpm_batch_mul 2 \
  --diffusion_loss_weight 1.4 \
  --train_diffusion_head True \
  --ce_loss_weight 0.04 \
  --voice_prompt_drop_rate 1.0 \
  --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.03 \
  --max_grad_norm 0.8 \
  --max_length 4096

echo "Single-speaker fine-tuning complete!"
echo "Model will now generate Elise voice from text only - no voice prompts needed!"