Spaces:
Running
on
Zero
Running
on
Zero
| config: configs/inference.yaml | |
| input_file: examples/infer_samples.txt | |
| debug: null | |
| infer: false | |
| hparams: '' | |
| dtype: bf16 | |
| exp_path: pretrained_models/OmniAvatar-14B | |
| text_encoder_path: pretrained_models/Wan2.1-T2V-14B/models_t5_umt5-xxl-enc-bf16.pth | |
| image_encoder_path: None | |
| dit_path: pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00001-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00002-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00003-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00004-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00005-of-00006.safetensors,pretrained_models/Wan2.1-T2V-14B/diffusion_pytorch_model-00006-of-00006.safetensors | |
| vae_path: pretrained_models/Wan2.1-T2V-14B/Wan2.1_VAE.pth | |
| # exp_path: pretrained_models/OmniAvatar-1.3B | |
| # text_encoder_path: pretrained_models/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth | |
| # image_encoder_path: None | |
| # dit_path: pretrained_models/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors | |
| # vae_path: pretrained_models/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth | |
| wav2vec_path: pretrained_models/wav2vec2-base-960h | |
| num_persistent_param_in_dit: | |
| reload_cfg: true | |
| sp_size: 1 | |
| seed: 42 | |
| image_sizes_720: | |
| # - - 400 | |
| # - 720 | |
| # - - 720 commented out due to the duration needed on HF | |
| # - 720 | |
| - - 720 | |
| - 400 | |
| image_sizes_1280: | |
| - - 720 | |
| - 720 | |
| - - 528 | |
| - 960 | |
| - - 960 | |
| - 528 | |
| - - 720 | |
| - 1280 | |
| - - 1280 | |
| - 720 | |
| max_hw: 720 | |
| max_tokens: 40000 | |
| seq_len: 200 | |
| overlap_frame: 13 | |
| guidance_scale: 4.5 | |
| audio_scale: null | |
| num_steps: 8 | |
| fps: 24 | |
| sample_rate: 16000 | |
| negative_prompt: Vivid color tones, background/camera moving quickly, screen switching, | |
| subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, | |
| style, work, painting, image, still, overall grayish, worst quality, low quality, | |
| JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly | |
| drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, | |
| chaotic background, three legs, crowded background with many people, walking backward | |
| silence_duration_s: 0.0 | |
| use_fsdp: false | |
| tea_cache_l1_thresh: 0 | |
| rank: 0 | |
| world_size: 1 | |
| local_rank: 0 | |
| device: cuda | |
| num_nodes: 1 | |
| i2v: true | |
| use_audio: true | |
| random_prefix_frames: true | |
| model_config: | |
| in_dim: 33 | |
| audio_hidden_size: 32 | |
| train_architecture: lora | |
| lora_target_modules: q,k,v,o,ffn.0,ffn.2 | |
| init_lora_weights: kaiming | |
| lora_rank: 128 | |
| lora_alpha: 64.0 |