---
# Inference configuration: pretrained model locations followed by
# data / sampling parameters. One key per line; all keys are top-level.

# Pretrained model paths
dtype: "bf16"
text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth
# NOTE(review): a plain `None` is loaded by YAML as the string "None", not as
# null. Left unchanged; confirm the consumer expects the string form.
image_encoder_path: None
# A single comma-separated string naming the six safetensors shards
# (no spaces between entries).
dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors
vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth
wav2vec_path: pretrained_models/wav2vec2-base-960h
exp_path: pretrained_models/OmniAvatar-14B
# You can set `num_persistent_param_in_dit` to a small number to reduce the
# VRAM required. Unset (null) here.
num_persistent_param_in_dit: null

reload_cfg: true
sp_size: 1

# Data parameters
seed: 42
# Candidate size pairs, grouped by the `max_hw` value they are named after.
# NOTE(review): whether each pair is [height, width] or [width, height] is not
# stated in this file; verify against the consumer.
image_sizes_720:
  - [400, 720]
  - [720, 720]
  - [720, 400]
image_sizes_1280:
  - [720, 720]
  - [528, 960]
  - [960, 528]
  - [720, 1280]
  - [1280, 720]
max_hw: 720  # 720: 480p; 1280: 720p
max_tokens: 30000
seq_len: 200
overlap_frame: 13  # must be 1 + 4*n
guidance_scale: 4.5
# Unset (null) here.
audio_scale: null
num_steps: 50
fps: 25
sample_rate: 16000
negative_prompt: "Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward"
silence_duration_s: 0.3
use_fsdp: false
# The larger this value is, the faster the speed, but the worse the visual
# quality. The original note also mentions 0.14 (presumably a suggested
# value; TODO check value).
tea_cache_l1_thresh: 0