zxymimi23451 committed
Commit 78360e7 · verified · 1 Parent(s): 00171d0

Upload 258 files

This view is limited to 50 files because it contains too many changes. See raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +9 -0
  2. .gitignore +42 -0
  3. LICENSE.txt +17 -0
  4. README.md +175 -13
  5. assets/comp_effic.png +3 -0
  6. assets/data_for_diff_stage.jpg +3 -0
  7. assets/i2v_res.png +3 -0
  8. assets/logo.png +0 -0
  9. assets/t2v_res.jpg +3 -0
  10. assets/vben_vs_sota.png +3 -0
  11. assets/video_dit_arch.jpg +3 -0
  12. assets/video_vae_res.jpg +3 -0
  13. configs/fantasy.json +15 -0
  14. configs/flf2v_720p.json +14 -0
  15. configs/i2v.json +14 -0
  16. configs/i2v_720p.json +14 -0
  17. configs/phantom_1.3B.json +14 -0
  18. configs/phantom_14B.json +14 -0
  19. configs/sky_df_1.3.json +14 -0
  20. configs/sky_df_14B.json +14 -0
  21. configs/t2v.json +14 -0
  22. configs/t2v_1.3B.json +14 -0
  23. configs/vace_1.3B.json +16 -0
  24. configs/vace_14B.json +16 -0
  25. docs/CHANGELOG.md +209 -0
  26. docs/CLI.md +226 -0
  27. docs/FINETUNES.md +85 -0
  28. docs/GETTING_STARTED.md +194 -0
  29. docs/INSTALLATION.md +170 -0
  30. docs/LORAS.md +252 -0
  31. docs/MODELS.md +275 -0
  32. docs/TROUBLESHOOTING.md +338 -0
  33. docs/VACE.md +214 -0
  34. fantasytalking/infer.py +36 -0
  35. fantasytalking/model.py +162 -0
  36. fantasytalking/utils.py +52 -0
  37. finetunes/hunyuan_t2v_accvideo.json +30 -0
  38. finetunes/hunyuan_t2v_fast.json +31 -0
  39. finetunes/t2v_fusionix.json +38 -0
  40. finetunes/t2v_sf.json +38 -0
  41. finetunes/vace_14B_fusionix.json +40 -0
  42. finetunes/vace_14B_sf.json +41 -0
  43. hyvideo/__init__.py +0 -0
  44. hyvideo/config.py +534 -0
  45. hyvideo/constants.py +164 -0
  46. hyvideo/data_kits/audio_dataset.py +170 -0
  47. hyvideo/data_kits/audio_preprocessor.py +76 -0
  48. hyvideo/data_kits/data_tools.py +41 -0
  49. hyvideo/data_kits/face_align/__init__.py +1 -0
  50. hyvideo/data_kits/face_align/align.py +34 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/comp_effic.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/data_for_diff_stage.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/i2v_res.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/t2v_res.jpg filter=lfs diff=lfs merge=lfs -text
40
+ assets/vben_vs_sota.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/video_dit_arch.jpg filter=lfs diff=lfs merge=lfs -text
42
+ assets/video_vae_res.jpg filter=lfs diff=lfs merge=lfs -text
43
+ preprocessing/matanyone/tutorial_multi_targets.mp4 filter=lfs diff=lfs merge=lfs -text
44
+ preprocessing/matanyone/tutorial_single_target.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,42 @@
1
+ .*
2
+ *.py[cod]
3
+ # *.jpg
4
+ *.jpeg
5
+ # *.png
6
+ *.gif
7
+ *.bmp
8
+ *.mp4
9
+ *.mov
10
+ *.mkv
11
+ *.log
12
+ *.zip
13
+ *.pt
14
+ *.pth
15
+ *.ckpt
16
+ *.safetensors
17
+ *.json
18
+ # *.txt
19
+ *.backup
20
+ *.pkl
21
+ *.html
22
+ *.pdf
23
+ *.whl
24
+ *.exe
25
+ cache
26
+ __pycache__/
27
+ storage/
28
+ samples/
29
+ !.gitignore
30
+ !requirements.txt
31
+ .DS_Store
32
+ *DS_Store
33
+ google/
34
+ Wan2.1-T2V-14B/
35
+ Wan2.1-T2V-1.3B/
36
+ Wan2.1-I2V-14B-480P/
37
+ Wan2.1-I2V-14B-720P/
38
+ outputs/
39
+ gradio_outputs/
40
+ ckpts/
41
+ loras/
42
+ loras_i2v/
LICENSE.txt ADDED
@@ -0,0 +1,17 @@
1
+ FREE for Non Commercial USE
2
+
3
+ You are free to:
4
+ - Share — copy and redistribute the material in any medium or format
5
+ - Adapt — remix, transform, and build upon the material
6
+ The licensor cannot revoke these freedoms as long as you follow the license terms.
7
+
8
+ Under the following terms:
9
+ - Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
10
+ - NonCommercial — You may not use the material for commercial purposes.
11
+
12
+ - No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
13
+ Notices:
14
+
15
+ - You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
16
+
17
+ No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
README.md CHANGED
@@ -1,13 +1,175 @@
1
- ---
2
- title: Wan2GP
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.34.2
8
- app_file: app.py
9
- pinned: false
10
- short_description: Wan2GP
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # WanGP
2
+
3
+ -----
4
+ <p align="center">
5
+ <b>WanGP by DeepBeepMeep: The best Open Source Video Generative Models Accessible to the GPU Poor</b>
6
+ </p>
7
+
8
+ WanGP supports the Wan (and derived models), Hunyuan Video and LTX Video models with:
9
+ - Low VRAM requirements (as low as 6 GB of VRAM is sufficient for certain models)
10
+ - Support for old GPUs (RTX 10XX, 20xx, ...)
11
+ - Very Fast on the latest GPUs
12
+ - Easy-to-use, fully web-based interface
13
+ - Auto download of the required model adapted to your specific architecture
14
+ - Integrated tools to facilitate Video Generation: Mask Editor, Prompt Enhancer, Temporal and Spatial Generation
15
+ - Loras Support to customize each model
16
+ - Queuing system: make your shopping list of videos to generate and come back later
17
+
18
+ **Discord Server to get Help from Other Users and show your Best Videos:** https://discord.gg/g7efUW9jGV
19
+
20
+ **Follow DeepBeepMeep on Twitter/X to get the Latest News**: https://x.com/deepbeepmeep
21
+
22
+ ## 🔥 Latest Updates
23
+ ### June 23 2025: WanGP v6.3, Vace Unleashed. Thought we couldn't squeeze even more out of Vace?
24
+ - Multithreaded preprocessing when possible for faster generations
25
+ - Multithreaded frames Lanczos Upsampling as a bonus
26
+ - A new Vace preprocessor: *Flow*, to extract fluid motion
27
+ - Multi Vace Controlnets: you can now transfer several properties at the same time. This opens new possibilities to explore: for instance, if you transfer *Human Movement* and *Shapes* at the same time, for some reason the lighting of your character will take the character's environment into account much more.
28
+ - Injected Frames Outpainting, in case you missed it in WanGP 6.21
29
+
30
+ Don't know how to use all of the Vace features? Check the Vace Guide embedded in WanGP, as it has also been updated.
31
+
32
+
33
+ ### June 19 2025: WanGP v6.2, Vace even more Powercharged
34
+ 👋 Have I told you that I am a big fan of Vace? Here are more goodies to unleash its power:
35
+ - If you ever wanted to watch Star Wars in 4:3, just use the new *Outpainting* feature and it will add the missing bits of image at the top and the bottom of the screen. The best thing is that *Outpainting* can be combined with all the other Vace modifications; for instance, you can change the main character of your favorite movie at the same time
36
+ - More processing can be combined at the same time (for instance the depth process can be applied outside the mask)
37
+ - Upgraded the depth extractor to Depth Anything 2, which is much more detailed
38
+
39
+ As a bonus, I have added two finetunes based on the Self-Forcing technology (which requires only 4 steps to generate a video): Wan 2.1 text2video Self-Forcing and Vace Self-Forcing. I know there is a Lora around, but the quality of the Lora is worse (at least with Vace) compared to the full model. Don't hesitate to share your opinion about this on the discord server.
40
+ ### June 17 2025: WanGP v6.1, Vace Powercharged
41
+ 👋 Lots of improvements for Vace, the Mother of all Models:
42
+ - masks can now be combined with on the fly processing of a control video, for instance you can extract the motion of a specific person defined by a mask
43
+ - on the fly modification of masks: reversed masks (with the same mask you can modify the background instead of the people covered by the masks), enlarged masks (you can cover more area if for instance the person you are trying to inject is larger than the one in the mask), ...
44
+ - view these modified masks directly inside WanGP during the video generation to check they are really as expected
45
+ - multiple frame injections: multiple frames can be injected at any location of the video
46
+ - expand past videos in one click: just select one generated video to expand it
47
+
48
+ Of course, all this new stuff works on all Vace finetunes (including Vace Fusionix).
49
+
50
+ Thanks also to Reevoy24 for adding a Notification sound at the end of a generation and for fixing the background color of the current generation summary.
51
+
52
+ ### June 12 2025: WanGP v6.0
53
+ 👋 *Finetune models*: Do you find the 20 models supported by WanGP not sufficient? Too impatient to wait for the next release to get support for a newly released model? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add support for it yourself by simply creating a finetune model definition. You can then store the model in the cloud (for instance on Huggingface), and the very light finetune definition file can easily be shared with other users; WanGP will automatically download the finetuned model for them.
54
+
55
+ To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu):
56
+ - *Fast Hunyuan Video*: generate t2v videos in only 6 steps
57
+ - *Hunyuan Video AccVideo*: generate t2v videos in only 5 steps
58
+ - *Wan FusioniX*: a combo of AccVideo / CausVid and other models that can generate high quality Wan videos in only 8 steps
59
+
60
+ One more thing...
61
+
62
+ The new finetune system can be used to combine complementary models: what happens when you combine Fusionix Text2Video and Vace Control Net?
63
+
64
+ You get **Vace FusioniX**: the Ultimate Vace Model, fast (10 steps, no need for guidance) and with much better video quality than the original, slower model (despite that one already being the best Control Net out there). Here goes one more finetune...
65
+
66
+ Check the *Finetune Guide* to create finetune model definitions and share them on the WanGP discord server.
67
+
68
+ ### June 11 2025: WanGP v5.5
69
+ 👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
70
+ *Hunyuan Video Custom Edit*: a Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping their pose. Similar to Vace but less restricted than the Wan models in terms of content...
71
+
72
+
73
+ ### June 6 2025: WanGP v5.41
74
+ 👋 Bonus release: Support for the **AccVideo** Lora to speed up Wan video generation by 2x. Check the Loras documentation to get the usage instructions of AccVideo.\
75
+ You will need to do a *pip install -r requirements.txt*
76
+
77
+ ### June 6 2025: WanGP v5.4
78
+ 👋 World Exclusive: **Hunyuan Video Avatar** Support! You won't need 80 GB of VRAM, nor even 32 GB: just 10 GB of VRAM is sufficient to generate up to 15s of high quality speech / song driven Video at high speed with no quality degradation. Support for TeaCache included.\
79
+ Here is a link to the original repo where you will find some very interesting documentation and examples. https://github.com/Tencent-Hunyuan/HunyuanVideo-Avatar. Kudos to the Hunyuan Video Avatar team for the best model of its kind.\
80
+ Also many thanks to Reevoy24 for repackaging and completing the documentation
81
+
82
+ ### May 28 2025: WanGP v5.31
83
+ 👋 Added **Phantom 14B**, a model that you can use to transfer objects / people into a video. My preference still goes to Vace, which remains the king of controlnets.
84
+ VACE improvements: Better sliding window transitions, image mask support in Matanyone, new Extend Video feature, and enhanced background removal options.
85
+
86
+ ### May 26, 2025: WanGP v5.3
87
+ 👋 Settings management revolution! Now you can:
88
+ - Select any generated video and click *Use Selected Video Settings* to instantly reuse its configuration
89
+ - Drag & drop videos to automatically extract their settings metadata
90
+ - Export/import settings as JSON files for easy sharing and backup
91
+
92
+ ### May 20, 2025: WanGP v5.2
93
+ 👋 **CausVid support** - Generate videos in just 4-12 steps with the new distilled Wan model! Also added experimental MoviiGen for 1080p generation (20GB+ VRAM required). Check the Loras documentation to get the usage instructions of CausVid.
94
+
95
+ ### May 18, 2025: WanGP v5.1
96
+ 👋 **LTX Video 13B Distilled** - Generate high-quality videos in less than one minute!
97
+
98
+ ### May 17, 2025: WanGP v5.0
99
+ 👋 **One App to Rule Them All!** Added Hunyuan Video and LTX Video support, plus Vace 14B and integrated prompt enhancer.
100
+
101
+ See full changelog: **[Changelog](docs/CHANGELOG.md)**
102
+
103
+ ## 📋 Table of Contents
104
+
105
+ - [🚀 Quick Start](#-quick-start)
106
+ - [📦 Installation](#-installation)
107
+ - [🎯 Usage](#-usage)
108
+ - [📚 Documentation](#-documentation)
109
+ - [🔗 Related Projects](#-related-projects)
110
+
111
+ ## 🚀 Quick Start
112
+
113
+ **One-click installation:** Get started instantly with [Pinokio App](https://pinokio.computer/)
114
+
115
+ **Manual installation:**
116
+ ```bash
117
+ git clone https://github.com/deepbeepmeep/Wan2GP.git
118
+ cd Wan2GP
119
+ conda create -n wan2gp python=3.10.9
120
+ conda activate wan2gp
121
+ pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
122
+ pip install -r requirements.txt
123
+ ```
124
+
125
+ **Run the application:**
126
+ ```bash
127
+ python wgp.py # Text-to-video (default)
128
+ python wgp.py --i2v # Image-to-video
129
+ ```
130
+
131
+ **Update the application:**
132
+ If you are using Pinokio, use Pinokio to update. Otherwise:
133
+ Go to the directory where WanGP is installed and run:
134
+ ```bash
135
+ git pull
136
+ pip install -r requirements.txt
137
+ ```
138
+
139
+ ## 📦 Installation
140
+
141
+ For detailed installation instructions for different GPU generations:
142
+ - **[Installation Guide](docs/INSTALLATION.md)** - Complete setup instructions for RTX 10XX to RTX 50XX
143
+
144
+ ## 🎯 Usage
145
+
146
+ ### Basic Usage
147
+ - **[Getting Started Guide](docs/GETTING_STARTED.md)** - First steps and basic usage
148
+ - **[Models Overview](docs/MODELS.md)** - Available models and their capabilities
149
+
150
+ ### Advanced Features
151
+ - **[Loras Guide](docs/LORAS.md)** - Using and managing Loras for customization
152
+ - **[Finetunes](docs/FINETUNES.md)** - Manually add new models to WanGP
153
+ - **[VACE ControlNet](docs/VACE.md)** - Advanced video control and manipulation
154
+ - **[Command Line Reference](docs/CLI.md)** - All available command line options
155
+
156
+ ## 📚 Documentation
157
+
158
+ - **[Changelog](docs/CHANGELOG.md)** - Latest updates and version history
159
+ - **[Troubleshooting](docs/TROUBLESHOOTING.md)** - Common issues and solutions
160
+
161
+ ## 🔗 Related Projects
162
+
163
+ ### Other Models for the GPU Poor
164
+ - **[HunyuanVideoGP](https://github.com/deepbeepmeep/HunyuanVideoGP)** - One of the best open source Text to Video generators
165
+ - **[Hunyuan3D-2GP](https://github.com/deepbeepmeep/Hunyuan3D-2GP)** - Image to 3D and text to 3D tool
166
+ - **[FluxFillGP](https://github.com/deepbeepmeep/FluxFillGP)** - Inpainting/outpainting tools based on Flux
167
+ - **[Cosmos1GP](https://github.com/deepbeepmeep/Cosmos1GP)** - Text to world generator and image/video to world
168
+ - **[OminiControlGP](https://github.com/deepbeepmeep/OminiControlGP)** - Flux-derived application for object transfer
169
+ - **[YuE GP](https://github.com/deepbeepmeep/YuEGP)** - Song generator with instruments and singer's voice
170
+
171
+ ---
172
+
173
+ <p align="center">
174
+ Made with ❤️ by DeepBeepMeep
175
+ </p>
assets/comp_effic.png ADDED

Git LFS Details

  • SHA256: b0e225caffb4b31295ad150f95ee852e4c3dde4a00ac8f79a2ff500f2ce26b8d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.79 MB
assets/data_for_diff_stage.jpg ADDED

Git LFS Details

  • SHA256: 59aec08409f2d46b0e640e4e120dc7cca52c08c3de56d026602dbcff1ebf241a
  • Pointer size: 131 Bytes
  • Size of remote file: 528 kB
assets/i2v_res.png ADDED

Git LFS Details

  • SHA256: 6823b3206d8d0cb18d3b5b949dec1217f1178109ba11f14e977b67e1f7b8a248
  • Pointer size: 131 Bytes
  • Size of remote file: 892 kB
assets/logo.png ADDED
assets/t2v_res.jpg ADDED

Git LFS Details

  • SHA256: 91db579092446be2a834bc67721a8e4346936f38c4edb912f459ca3e10f8f439
  • Pointer size: 131 Bytes
  • Size of remote file: 301 kB
assets/vben_vs_sota.png ADDED

Git LFS Details

  • SHA256: 9a0e86ca85046d2675f97984b88b6e74df07bba8a62a31ab8a1aef50d4eda44e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
assets/video_dit_arch.jpg ADDED

Git LFS Details

  • SHA256: 195dceec6570289d8b01cc51d2e28a7786216f19de55b23978a52610d1646a66
  • Pointer size: 131 Bytes
  • Size of remote file: 643 kB
assets/video_vae_res.jpg ADDED

Git LFS Details

  • SHA256: d8f9e7f7353848056a615c8ef35ab86ec22976bb46cb27405008b4089701945c
  • Pointer size: 131 Bytes
  • Size of remote file: 213 kB
configs/fantasy.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 36,
9
+ "model_type": "i2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512,
14
+ "fantasytalking_dim": 2048
15
+ }
configs/flf2v_720p.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 36,
9
+ "model_type": "i2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/i2v.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 36,
9
+ "model_type": "i2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/i2v_720p.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 36,
9
+ "model_type": "i2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/phantom_1.3B.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 1536,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 8960,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 12,
11
+ "num_layers": 30,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/phantom_14B.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/sky_df_1.3.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 1536,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 8960,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 12,
11
+ "num_layers": 30,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/sky_df_14B.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/t2v.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/t2v_1.3B.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "_class_name": "WanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 1536,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 8960,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 12,
11
+ "num_layers": 30,
12
+ "out_dim": 16,
13
+ "text_len": 512
14
+ }
configs/vace_1.3B.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "_class_name": "VaceWanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 1536,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 8960,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 12,
11
+ "num_layers": 30,
12
+ "out_dim": 16,
13
+ "text_len": 512,
14
+ "vace_layers": [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
15
+ "vace_in_dim": 96
16
+ }
configs/vace_14B.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "_class_name": "VaceWanModel",
3
+ "_diffusers_version": "0.30.0",
4
+ "dim": 5120,
5
+ "eps": 1e-06,
6
+ "ffn_dim": 13824,
7
+ "freq_dim": 256,
8
+ "in_dim": 16,
9
+ "model_type": "t2v",
10
+ "num_heads": 40,
11
+ "num_layers": 40,
12
+ "out_dim": 16,
13
+ "text_len": 512,
14
+ "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
15
+ "vace_in_dim": 96
16
+ }
docs/CHANGELOG.md ADDED
@@ -0,0 +1,209 @@
1
+ # Changelog
2
+
3
+ ## 🔥 Latest News
4
+ ### June 19 2025: WanGP v6.2, Vace even more Powercharged
5
+ Have I told you that I am a big fan of Vace ? Here are more goodies to unleash its power:
6
+ - If you ever wanted to watch Star Wars in 4:3, just use the new *Outpainting* feature and it will add the missing bits of image at the top and the bottom of the screen. The best thing is *Outpainting* can be combined with all the other Vace modifications, for instance you can change the main character of your favorite movie at the same time
7
+ - More processing can be combined at the same time (for instance the depth process can be applied outside the mask)
8
+ - Upgraded the depth extractor to Depth Anything 2 which is much more detailed
9
+
10
+ As a bonus, I have added two finetunes based on the Self-Forcing technology (which requires only 4 steps to generate a video): Wan 2.1 text2video Self-Forcing and Vace Self-Forcing. I know there is a Lora around, but the quality of the Lora is worse (at least with Vace) compared to the full model. Don't hesitate to share your opinion about this on the discord server.
11
+
12
+ ### June 17 2025: WanGP v6.1, Vace Powercharged
13
+ Lots of improvements for Vace the Mother of all Models:
14
+ - masks can now be combined with on the fly processing of a control video, for instance you can extract the motion of a specific person defined by a mask
15
+ - on the fly modification of masks : reversed masks (with the same mask you can modify the background instead of the people covered by the masks), enlarged masks (you can cover more area if for instance the person you are trying to inject is larger than the one in the mask), ...
16
+ - view these modified masks directly inside WanGP during the video generation to check they are really as expected
17
+ - multiple frame injections: multiple frames can be injected at any location of the video
18
+ - expand past videos in one click: just select one generated video to expand it
19
+
20
+ Of course, all this new stuff works on all Vace finetunes (including Vace Fusionix).
21
+
22
+ Thanks also to Reevoy24 for adding a Notification sound at the end of a generation and for fixing the background color of the current generation summary.
23
+
24
+ ### June 12 2025: WanGP v6.0
25
+ 👋 *Finetune models*: Do you find the 20 models supported by WanGP not sufficient? Too impatient to wait for the next release to get support for a newly released model? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add support for it yourself by simply creating a finetune model definition. You can then store the model in the cloud (for instance on Huggingface), and the very light finetune definition file can easily be shared with other users; WanGP will automatically download the finetuned model for them.
26
+
27
+ To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu):
28
+ - *Fast Hunyuan Video*: generate t2v videos in only 6 steps
29
+ - *Hunyuan Video AccVideo*: generate t2v videos in only 5 steps
30
+ - *Wan FusioniX*: a combo of AccVideo / CausVid and other models that can generate high quality Wan videos in only 8 steps
31
+
32
+ One more thing...
33
+
34
+ The new finetune system can be used to combine complementary models: what happens when you combine Fusionix Text2Video and Vace Control Net?
35
+
36
+ You get **Vace FusioniX**: the Ultimate Vace Model, fast (10 steps, no need for guidance) and with much better video quality than the original, slower model (despite that one already being the best Control Net out there). Here goes one more finetune...
37
+
38
+ Check the *Finetune Guide* to create finetune model definitions and share them on the WanGP discord server.
39
+
40
+ ### June 12 2025: WanGP v5.6
41
+ 👋 *Finetune models*: Do you find the 20 models supported by WanGP not sufficient? Too impatient to wait for the next release to get support for a newly released model? Your prayers have been answered: if a new model is compatible with a model architecture supported by WanGP, you can add support for it yourself by simply creating a finetune model definition. You can then store the model in the cloud (for instance on Huggingface), and the very light finetune definition file can easily be shared with other users; WanGP will automatically download the finetuned model for them.
42
+
43
+ To celebrate the new finetunes support, here are a few finetune gifts (directly accessible from the model selection menu):
44
+ - *Fast Hunyuan Video*: generate t2v videos in only 6 steps
45
+ - *Hunyuan Video AccVideo*: generate t2v videos in only 5 steps
46
+ - *Wan FusioniX*: a combo of AccVideo / CausVid and other models that can generate high quality Wan videos in only 8 steps
47
+
48
+ One more thing...
49
+
50
+ The new finetune system can be used to combine complementary models: what happens when you combine Fusionix Text2Video and Vace Control Net?
51
+
52
+ You get **Vace FusioniX**: the Ultimate Vace Model, fast (10 steps, no need for guidance) and with much better video quality than the original, slower model (despite that one already being the best Control Net out there). Here goes one more finetune...
53
+
54
+ Check the *Finetune Guide* to create finetune model definitions and share them on the WanGP discord server.
55
+
56
+ ### June 11 2025: WanGP v5.5
57
+ 👋 *Hunyuan Video Custom Audio*: it is similar to Hunyuan Video Avatar except there isn't any lower limit on the number of frames and you can use your reference images in a different context than the image itself\
58
+ *Hunyuan Video Custom Edit*: a Hunyuan Video Controlnet, use it to do inpainting and replace a person in a video while still keeping their pose. Similar to Vace but less restricted than the Wan models in terms of content...
59
+
60
+ ### June 6 2025: WanGP v5.41
61
+ 👋 Bonus release: Support for the **AccVideo** Lora to speed up Wan video generation by 2x. Check the Loras documentation to get the usage instructions of AccVideo.
62
+
63
+ ### June 6 2025: WanGP v5.4
64
+ 👋 World Exclusive: Hunyuan Video Avatar Support! You won't need 80 GB of VRAM, nor even 32 GB: just 10 GB of VRAM is sufficient to generate up to 15s of high quality speech / song driven Video at high speed with no quality degradation. Support for TeaCache included.
65
+
66
+ ### May 26, 2025: WanGP v5.3
67
+ 👋 Happy with a video generation and want to do more generations using the same settings, but you can't remember what you did or find it too hard to copy/paste each setting one by one from the file metadata? Rejoice! There are now multiple ways to turn this tedious process into a one click task:
68
+ - Select one Video recently generated in the Video Gallery and click *Use Selected Video Settings*
69
+ - Click *Drop File Here* and select a Video you saved somewhere, if the settings metadata have been saved with the Video you will be able to extract them automatically
70
+ - Click *Export Settings to File* to save on your harddrive the current settings. You will be able to use them later again by clicking *Drop File Here* and select this time a Settings json file
71
+
72
+ ### May 23, 2025: WanGP v5.21
73
+ 👋 Improvements for Vace: better transitions between Sliding Windows, Support for Image masks in Matanyone, new Extend Video for Vace, different types of automated background removal
74
+
75
+ ### May 20, 2025: WanGP v5.2
76
+ 👋 Added support for Wan CausVid which is a distilled Wan model that can generate nice looking videos in only 4 to 12 steps. The great thing is that Kijai (Kudos to him!) has created a CausVid Lora that can be combined with any existing Wan t2v model 14B like Wan Vace 14B. See [LORAS.md](LORAS.md) for instructions on how to use CausVid.
77
+
78
+ Also as an experiment I have added support for MoviiGen, the first model that claims to be capable of generating 1080p videos (if you have enough VRAM (20GB...) and are ready to wait for a long time...). Don't hesitate to share your impressions on the Discord server.
79
+
80
+ ### May 18, 2025: WanGP v5.1
81
+ 👋 Bonus Day, added LTX Video 13B Distilled: generate in less than one minute, very high quality Videos!
82
+
83
+ ### May 17, 2025: WanGP v5.0
84
+ 👋 One App to Rule Them All! Added support for the other great open source architectures:
85
+ - **Hunyuan Video**: text 2 video (one of the best, if not the best t2v), image 2 video and the recently released Hunyuan Custom (very good identity preservation when injecting a person into a video)
86
+ - **LTX Video 13B** (released last week): very long video support and fast 720p generation. Wan GP version has been greatly optimized and reduced LTX Video VRAM requirements by 4!
87
+
88
+ Also:
89
+ - Added support for the best Control Video Model, released 2 days ago: Vace 14B
90
+ - New Integrated prompt enhancer to increase the quality of the generated videos
91
+
92
+ *You will need one more `pip install -r requirements.txt`*
93
+
94
+ ### May 5, 2025: WanGP v4.5
95
+ 👋 FantasySpeaking model, you can animate a talking head using a voice track. This works not only on people but also on objects. Also better seamless transitions between Vace sliding windows for very long videos. New high quality processing features (mixed 16/32 bits calculation and 32 bits VAE)
96
+
97
+ ### April 27, 2025: WanGP v4.4
98
+ 👋 Phantom model support, very good model to transfer people or objects into video, works quite well at 720p and with the number of steps > 30
99
+
100
+ ### April 25, 2025: WanGP v4.3
101
+ 👋 Added preview mode and support for Sky Reels v2 Diffusion Forcing for high quality "infinite length videos". Note that Skyreel uses causal attention that is only supported by Sdpa attention so even if you choose another type of attention, some of the processes will use Sdpa attention.
102
+
103
+ ### April 18, 2025: WanGP v4.2
104
+ 👋 FLF2V model support, official support from Wan for image2video start and end frames specialized for 720p.
105
+
106
+ ### April 17, 2025: WanGP v4.1
107
+ 👋 Recam Master model support, view a video from a different angle. The video to process must be at least 81 frames long and you should set at least 15 steps denoising to get good results.
108
+
109
+ ### April 13, 2025: WanGP v4.0
110
+ 👋 Lots of goodies for you!
111
+ - A new UI, tabs were replaced by a Dropdown box to easily switch models
112
+ - A new queuing system that lets you stack in a queue as many text2video, image2video, ... tasks as you want. Each task can rely on completely different generation parameters (different number of frames, steps, loras, ...). Many thanks to **Tophness** for being a big contributor on this new feature
113
+ - Temporal upsampling (Rife) and spatial upsampling (Lanczos) for a smoother video (32 fps or 64 fps) and to enlarge your video by x2 or x4. Check these new advanced options.
114
+ - Wan Vace Control Net support: with Vace you can inject in the scene people or objects, animate a person, perform inpainting or outpainting, continue a video, ... See [VACE.md](VACE.md) for introduction guide.
115
+ - Integrated *Matanyone* tool directly inside WanGP so that you can create easily inpainting masks used in Vace
116
+ - Sliding Window generation for Vace, create windows that can last dozens of seconds
117
+ - New optimizations for old generation GPUs: Generate 5s (81 frames, 15 steps) of Vace 1.3B with only 5GB and in only 6 minutes on a RTX 2080Ti and 5s of t2v 14B in less than 10 minutes.
118
+
119
+ ### March 27, 2025
120
+ 👋 Added support for the new Wan Fun InP models (image2video). The 14B Fun InP has probably better end image support but unfortunately existing loras do not work so well with it. The great novelty is the Fun InP image2 1.3B model: Image 2 Video is now accessible to even lower hardware configuration. It is not as good as the 14B models but very impressive for its size. Many thanks to the VideoX-Fun team (https://github.com/aigc-apps/VideoX-Fun)
121
+
122
+ ### March 26, 2025
123
+ 👋 Good news! Official support for RTX 50xx please check the [installation instructions](INSTALLATION.md).
124
+
125
+ ### March 24, 2025: Wan2.1GP v3.2
126
+ 👋
127
+ - Added Classifier-Free Guidance Zero Star. The video should match better the text prompt (especially with text2video) at no performance cost: many thanks to the **CFG Zero * Team**. Don't hesitate to give them a star if you appreciate the results: https://github.com/WeichenFan/CFG-Zero-star
128
+ - Added back support for PyTorch compilation with Loras. It seems it had been broken for some time
129
+ - Added possibility to keep a number of pregenerated videos in the Video Gallery (useful to compare outputs of different settings)
130
+
131
+ *You will need one more `pip install -r requirements.txt`*
132
+
133
+ ### March 19, 2025: Wan2.1GP v3.1
134
+ 👋 Faster launch and RAM optimizations (should require less RAM to run)
135
+
136
+ *You will need one more `pip install -r requirements.txt`*
137
+
138
+ ### March 18, 2025: Wan2.1GP v3.0
139
+ 👋
140
+ - New Tab based interface, you can switch from i2v to t2v conversely without restarting the app
141
+ - Experimental Dual Frames mode for i2v, you can also specify an End frame. It doesn't always work, so you will need a few attempts.
142
+ - You can save default settings in the files *i2v_settings.json* and *t2v_settings.json* that will be used when launching the app (you can also specify the path to different settings files)
143
+ - Slight acceleration with loras
144
+
145
+ *You will need one more `pip install -r requirements.txt`*
146
+
147
+ Many thanks to *Tophness* who created the framework (and did a big part of the work) of the multitabs and saved settings features
148
+
149
+ ### March 18, 2025: Wan2.1GP v2.11
150
+ 👋 Added more command line parameters to prefill the generation settings + customizable output directory and choice of type of metadata for generated videos. Many thanks to *Tophness* for his contributions.
151
+
152
+ *You will need one more `pip install -r requirements.txt` to reflect new dependencies*
153
+
154
+ ### March 18, 2025: Wan2.1GP v2.1
155
+ 👋 More Loras!: added support for 'Safetensors' and 'Replicate' Lora formats.
156
+
157
+ *You will need to refresh the requirements with a `pip install -r requirements.txt`*
158
+
159
+ ### March 17, 2025: Wan2.1GP v2.0
160
+ 👋 The Lora festival continues:
161
+ - Clearer user interface
162
+ - Download 30 Loras in one click to try them all (expand the info section)
163
+ - Very easy to use Loras as now Lora presets can input the subject (or other needed terms) of the Lora so that you don't have to modify manually a prompt
164
+ - Added basic macro prompt language to prefill prompts with different values. With one prompt template, you can generate multiple prompts.
165
+ - New Multiple images prompts: you can now combine any number of images with any number of text prompts (need to launch the app with --multiple-images)
166
+ - New command lines options to launch directly the 1.3B t2v model or the 14B t2v model
167
+
168
+ ### March 14, 2025: Wan2.1GP v1.7
169
+ 👋
170
+ - Lora Fest special edition: very fast loading/unload of loras for those Loras collectors around. You can also now add/remove loras in the Lora folder without restarting the app.
171
+ - Added experimental Skip Layer Guidance (advanced settings), that should improve the image quality at no extra cost. Many thanks to the *AmericanPresidentJimmyCarter* for the original implementation
172
+
173
+ *You will need to refresh the requirements `pip install -r requirements.txt`*
174
+
175
+ ### March 13, 2025: Wan2.1GP v1.6
176
+ 👋 Better Loras support, accelerated loading Loras.
177
+
178
+ *You will need to refresh the requirements `pip install -r requirements.txt`*
179
+
180
+ ### March 10, 2025: Wan2.1GP v1.5
181
+ 👋 Official Teacache support + Smart Teacache (find automatically best parameters for a requested speed multiplier), 10% speed boost with no quality loss, improved lora presets (they can now include prompts and comments to guide the user)
182
+
183
+ ### March 7, 2025: Wan2.1GP v1.4
184
+ 👋 Fix PyTorch compilation, now it is really 20% faster when activated
185
+
186
+ ### March 4, 2025: Wan2.1GP v1.3
187
+ 👋 Support for Image to Video with multiples images for different images/prompts combinations (requires *--multiple-images* switch), and added command line *--preload x* to preload in VRAM x MB of the main diffusion model if you find there is too much unused VRAM and you want to (slightly) accelerate the generation process.
188
+
189
+ *If you upgrade you will need to do a `pip install -r requirements.txt` again.*
190
+
191
+ ### March 4, 2025: Wan2.1GP v1.2
192
+ 👋 Implemented tiling on VAE encoding and decoding. No more VRAM peaks at the beginning and at the end
193
+
194
+ ### March 3, 2025: Wan2.1GP v1.1
195
+ 👋 Added Tea Cache support for faster generations: optimization of kijai's implementation (https://github.com/kijai/ComfyUI-WanVideoWrapper/) of teacache (https://github.com/ali-vilab/TeaCache)
196
+
197
+ ### March 2, 2025: Wan2.1GP by DeepBeepMeep v1
198
+ 👋 Brings:
199
+ - Support for all Wan including the Image to Video model
200
+ - Reduced memory consumption by 2, with the possibility to generate more than 10s of video at 720p with a RTX 4090 and 10s of video at 480p with less than 12GB of VRAM. Many thanks to RIFLEx (https://github.com/thu-ml/RIFLEx) for their algorithm that allows generating nice looking videos longer than 5s.
201
+ - The usual perks: web interface, multiple generations, loras support, sage attention, auto download of models, ...
202
+
203
+ ## Original Wan Releases
204
+
205
+ ### February 25, 2025
206
+ 👋 We've released the inference code and weights of Wan2.1.
207
+
208
+ ### February 27, 2025
209
+ 👋 Wan2.1 has been integrated into [ComfyUI](https://comfyanonymous.github.io/ComfyUI_examples/wan/). Enjoy!
docs/CLI.md ADDED
@@ -0,0 +1,226 @@
1
+ # Command Line Reference
2
+
3
+ This document covers all available command line options for WanGP.
4
+
5
+ ## Basic Usage
6
+
7
+ ```bash
8
+ # Default launch
9
+ python wgp.py
10
+
11
+ # Specific model modes
12
+ python wgp.py --i2v # Image-to-video
13
+ python wgp.py --t2v # Text-to-video (default)
14
+ python wgp.py --t2v-14B # 14B text-to-video model
15
+ python wgp.py --t2v-1-3B # 1.3B text-to-video model
16
+ python wgp.py --i2v-14B # 14B image-to-video model
17
+ python wgp.py --i2v-1-3B # Fun InP 1.3B image-to-video model
18
+ python wgp.py --vace-1-3B # VACE ControlNet 1.3B model
19
+ ```
20
+
21
+ ## Model and Performance Options
22
+
23
+ ### Model Configuration
24
+ ```bash
25
+ --quantize-transformer BOOL # Enable/disable transformer quantization (default: True)
26
+ --compile # Enable PyTorch compilation (requires Triton)
27
+ --attention MODE # Force attention mode: sdpa, flash, sage, sage2
28
+ --profile NUMBER # Performance profile 1-5 (default: 4)
29
+ --preload NUMBER # Preload N MB of diffusion model in VRAM
30
+ --fp16 # Force fp16 instead of bf16 models
31
+ --gpu DEVICE # Run on specific GPU device (e.g., "cuda:1")
32
+ ```
33
+
34
+ ### Performance Profiles
35
+ - **Profile 1**: Load the entire current model in VRAM and keep all unused models in reserved RAM for fast VRAM transfers
36
+ - **Profile 2**: Load model parts as needed, keep all unused models in reserved RAM for fast VRAM transfers
37
+ - **Profile 3**: Load entire current model in VRAM (requires 24GB for 14B model)
38
+ - **Profile 4**: Default and recommended, load model parts as needed, most flexible option
39
+ - **Profile 5**: Minimum RAM usage
40
+
41
+ ### Memory Management
42
+ ```bash
43
+ --perc-reserved-mem-max FLOAT # Max percentage of RAM for reserved memory (< 0.5)
44
+ ```
45
+
46
+ ## Lora Configuration
47
+
48
+ ```bash
49
+ --lora-dir PATH # Path to Wan t2v loras directory
50
+ --lora-dir-i2v PATH # Path to Wan i2v loras directory
51
+ --lora-dir-hunyuan PATH # Path to Hunyuan t2v loras directory
52
+ --lora-dir-hunyuan-i2v PATH # Path to Hunyuan i2v loras directory
53
+ --lora-dir-ltxv PATH # Path to LTX Video loras directory
54
+ --lora-preset PRESET # Load lora preset file (.lset) on startup
55
+ --check-loras # Filter incompatible loras (slower startup)
56
+ ```
57
+
58
+ ## Generation Settings
59
+
60
+ ### Basic Generation
61
+ ```bash
62
+ --seed NUMBER # Set default seed value
63
+ --frames NUMBER # Set default number of frames to generate
64
+ --steps NUMBER # Set default number of denoising steps
65
+ --advanced # Launch with advanced mode enabled
66
+ ```
67
+
68
+ ### Advanced Generation
69
+ ```bash
70
+ --teacache MULTIPLIER # TeaCache speed multiplier: 0, 1.5, 1.75, 2.0, 2.25, 2.5
71
+ ```
72
+
73
+ ## Interface and Server Options
74
+
75
+ ### Server Configuration
76
+ ```bash
77
+ --server-port PORT # Gradio server port (default: 7860)
78
+ --server-name NAME # Gradio server name (default: localhost)
79
+ --listen # Make server accessible on network
80
+ --share # Create shareable HuggingFace URL for remote access
81
+ --open-browser # Open browser automatically when launching
82
+ ```
83
+
84
+ ### Interface Options
85
+ ```bash
86
+ --lock-config # Prevent modifying video engine configuration from interface
87
+ --theme THEME_NAME # UI theme: "default" or "gradio"
88
+ ```
89
+
90
+ ## File and Directory Options
91
+
92
+ ```bash
93
+ --settings PATH # Path to folder containing default settings for all models
94
+ --verbose LEVEL # Information level 0-2 (default: 1)
95
+ ```
96
+
97
+ ## Examples
98
+
99
+ ### Basic Usage Examples
100
+ ```bash
101
+ # Launch with specific model and loras
102
+ python wgp.py --t2v-14B --lora-preset mystyle.lset
103
+
104
+ # High-performance setup with compilation
105
+ python wgp.py --compile --attention sage2 --profile 3
106
+
107
+ # Low VRAM setup
108
+ python wgp.py --t2v-1-3B --profile 4 --attention sdpa
109
+
110
+ # Multiple images with custom lora directory
111
+ python wgp.py --i2v --multiple-images --lora-dir /path/to/shared/loras
112
+ ```
113
+
114
+ ### Server Configuration Examples
115
+ ```bash
116
+ # Network accessible server
117
+ python wgp.py --listen --server-port 8080
118
+
119
+ # Shareable server with custom theme
120
+ python wgp.py --share --theme gradio --open-browser
121
+
122
+ # Locked configuration for public use
123
+ python wgp.py --lock-config --share
124
+ ```
125
+
126
+ ### Advanced Performance Examples
127
+ ```bash
128
+ # Maximum performance (requires high-end GPU)
129
+ python wgp.py --compile --attention sage2 --profile 3 --preload 2000
130
+
131
+ # Optimized for RTX 2080Ti
132
+ python wgp.py --profile 4 --attention sdpa --teacache 2.0
133
+
134
+ # Memory-efficient setup
135
+ python wgp.py --fp16 --profile 4 --perc-reserved-mem-max 0.3
136
+ ```
137
+
138
+ ### TeaCache Configuration
139
+ ```bash
140
+ # Different speed multipliers
141
+ python wgp.py --teacache 1.5 # 1.5x speed, minimal quality loss
142
+ python wgp.py --teacache 2.0 # 2x speed, some quality loss
143
+ python wgp.py --teacache 2.5 # 2.5x speed, noticeable quality loss
144
+ python wgp.py --teacache 0 # Disable TeaCache
145
+ ```
146
+
147
+ ## Attention Modes
148
+
149
+ ### SDPA (Default)
150
+ ```bash
151
+ python wgp.py --attention sdpa
152
+ ```
153
+ - Available by default with PyTorch
154
+ - Good compatibility with all GPUs
155
+ - Moderate performance
156
+
157
+ ### Sage Attention
158
+ ```bash
159
+ python wgp.py --attention sage
160
+ ```
161
+ - Requires Triton installation
162
+ - 30% faster than SDPA
163
+ - Small quality cost
164
+
165
+ ### Sage2 Attention
166
+ ```bash
167
+ python wgp.py --attention sage2
168
+ ```
169
+ - Requires Triton and SageAttention 2.x
170
+ - 40% faster than SDPA
171
+ - Best performance option
172
+
173
+ ### Flash Attention
174
+ ```bash
175
+ python wgp.py --attention flash
176
+ ```
177
+ - May require CUDA kernel compilation
178
+ - Good performance
179
+ - Can be complex to install on Windows
180
+
181
+ ## Troubleshooting Command Lines
182
+
183
+ ### Fallback to Basic Setup
184
+ ```bash
185
+ # If advanced features don't work
186
+ python wgp.py --attention sdpa --profile 4 --fp16
187
+ ```
188
+
189
+ ### Debug Mode
190
+ ```bash
191
+ # Maximum verbosity for troubleshooting
192
+ python wgp.py --verbose 2 --check-loras
193
+ ```
194
+
195
+ ### Memory Issue Debugging
196
+ ```bash
197
+ # Minimal memory usage
198
+ python wgp.py --profile 4 --attention sdpa --perc-reserved-mem-max 0.2
199
+ ```
200
+
201
+
202
+
203
+ ## Configuration Files
204
+
205
+ ### Settings Files
206
+ Load custom settings:
207
+ ```bash
208
+ python wgp.py --settings /path/to/settings/folder
209
+ ```
210
+
211
+ ### Lora Presets
212
+ Create and share lora configurations:
213
+ ```bash
214
+ # Load specific preset
215
+ python wgp.py --lora-preset anime_style.lset
216
+
217
+ # With custom lora directory
218
+ python wgp.py --lora-preset mystyle.lset --lora-dir /shared/loras
219
+ ```
220
+
221
+ ## Environment Variables
222
+
223
+ While not command line options, these environment variables can affect behavior (an illustrative launch command follows the list):
224
+ - `CUDA_VISIBLE_DEVICES` - Limit visible GPUs
225
+ - `PYTORCH_CUDA_ALLOC_CONF` - CUDA memory allocation settings
226
+ - `TRITON_CACHE_DIR` - Triton cache directory (for Sage attention)
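+
+ As an illustration, they can be set inline when launching WanGP; the specific values below are placeholders, not recommendations:
+
+ ```bash
+ # Pin the run to the first GPU, tune the CUDA caching allocator and redirect the Triton cache
+ CUDA_VISIBLE_DEVICES=0 \
+ PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512 \
+ TRITON_CACHE_DIR=/tmp/triton_cache \
+ python wgp.py --attention sage
+ ```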
docs/FINETUNES.md ADDED
@@ -0,0 +1,85 @@
1
+ # FINETUNES
2
+
3
+ A finetuned model is a model that shares the architecture of a specific base model but has weights derived from it. Some finetuned models have been created by combining multiple finetuned models.
4
+
5
+ As there is potentially an infinite number of finetunes, specific finetuned models are not known to WanGP by default. However, you can create a finetuned model definition that tells WanGP about the existence of this finetuned model, and WanGP will do all the usual work for you: autodownload the model and build the user interface.
6
+
7
+ Finetune model definitions are light json files that can be easily shared. You can find some of them on the WanGP *discord* server https://discord.gg/g7efUW9jGV
8
+
9
+ Finetuned models have been tested so far with Wan2.1 text2video, Wan2.1 image2video, Hunyuan Video text2video. There isn't currently any support for LTX Video finetunes.
10
+
11
+ ## Create a new Finetune Model Definition
12
+ All finetune model definitions are json files stored in the **finetunes** subfolder. All the corresponding finetune model weights will be stored in the *ckpts* subfolder and will sit next to the base models.
13
+
14
+ WanGP comes with a few prebuilt finetune models that you can use as starting points and to get an idea of the structure of the definition file.
15
+
16
+ A definition is built from a *settings file* that can contain all the default parameters for a video generation. On top of this file, a subtree named **model** contains all the information regarding the finetune (URLs to download the model, corresponding base model id, ...).
17
+
18
+ You can obtain a settings file in several ways:
19
+ - In the subfolder **settings**, get the json file that corresponds to the base model of your finetune (see the next section for the list of ids of base models)
20
+ - From the user interface, go to the base model and click **export settings**
21
+
22
+ Here are the steps (a minimal example definition follows the list):
23
+ 1) Create a *settings file*
24
+ 2) Add a **model** subtree with the finetune description
25
+ 3) Save this file in the subfolder **finetunes**. The name used for the file will be used as its id. It is good practice to prefix the name of this file with the base model: for instance, for a finetune named **Fast** based on the Hunyuan Text 2 Video model, use *hunyuan_t2v_fast.json*. In this example the id is *hunyuan_t2v_fast*.
26
+ 4) Restart WanGP
27
+
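+ For orientation, here is a minimal sketch of what such a definition file could look like. The name, description and URL below are placeholders, and the generation defaults inherited from the exported *settings file* are omitted for brevity:
+
+ ```
+ {
+     "model":
+     {
+         "name": "Hunyuan t2v Fast (example)",
+         "architecture": "hunyuan",
+         "description": "Illustrative finetune definition only",
+         "URLs": [
+             "https://huggingface.co/<account>/<repo>/resolve/main/hunyuan_t2v_fast_bf16.safetensors"
+         ],
+         "preload_URLs": [],
+         "auto_quantize": true
+     }
+ }
+ ```
+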
28
+ ## Architecture Models Ids
29
+ A finetune is derived from a base model and will inherit all of its user interface and model capabilities. Here are the architecture ids:
30
+ - *t2v*: Wan 2.1 text 2 video
31
+ - *i2v*: Wan 2.1 image 2 video 480p
32
+ - *i2v_720p*: Wan 2.1 image 2 video 720p
33
+ - *vace_14B*: Wan 2.1 Vace 14B
34
+ - *hunyuan*: Hunyuan Video text 2 video
35
+ - *hunyuan_i2v*: Hunyuan Video image 2 video
36
+
37
+ ## The Model Subtree
38
+ - *name*: name of the finetune, used to select it in the model selection menu
39
+ - *architecture* : architecture Id of the base model of the finetune (see previous section)
40
+ - *description*: description of the finetune that will appear at the top
41
+ - *URLs*: URLs of all the finetune versions (quantized / non quantized). WanGP will pick the version that is closest to the user preferences. You will need to follow a naming convention so that WanGP can identify the content of each version (see the next section). Right now WanGP supports only 8-bit quantized models that have been quantized using **quanto**. WanGP offers a command switch to easily build such a quantized model (see below). *URLs* can also contain paths to local files to allow testing.
42
+ - *modules*: this is a list of modules to be combined with the models referenced by the URLs. A module is a model extension that is merged with a model to expand its capabilities. So far the only module supported is Vace 14B (its id is *vace_14B*): for instance, the full Vace model is the fusion of a Wan text 2 video model and the Vace module (see the snippet after the example below).
43
+ - *preload_URLs*: URLs of files to download no matter what (used to load quantization maps for instance)
44
+ - *auto_quantize*: if set to True and no quantized model URL is provided, WanGP will perform on the fly quantization if the user expects a quantized model
45
+
46
+ Example of **model** subtree
47
+ ```
48
+ "model":
49
+ {
50
+ "name": "Wan text2video FusioniX 14B",
51
+ "architecture" : "t2v",
52
+ "description": "A powerful merged text-to-video model based on the original WAN 2.1 T2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail. multiple open-source models and LoRAs to boost temporal quality, expressiveness, and motion realism.",
53
+ "URLs": [
54
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors",
55
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors",
56
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors"
57
+ ],
58
+ "preload_URLs": [
59
+ ],
60
+ "auto_quantize": true
61
+ },
62
+ ```
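+
+ If the finetune also needs a module (currently only Vace 14B), the *modules* field described above would be added alongside the URLs. Presumably it is a list of module ids, along these lines (an assumption for illustration, not a verbatim file):
+
+ ```
+ "modules": ["vace_14B"],
+ ```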
63
+
64
+ ## Finetune Model Naming Convention
65
+ If a model is not quantized, it is assumed to be mostly 16 bits (with maybe a few 32 bits weights), so *bf16* or *fp16* should appear somewhere in the name. If you need examples just look at the **ckpts** subfolder, the naming convention for the base models is the same.
66
+
67
+ If a model is quantized, the term *quanto* should also be included since WanGP supports for the moment only *quanto* quantized models. More specifically, you should replace *fp16* by *quanto_fp16_int8* or *bf16* by *quanto_bf16_int8*.
68
+
69
+ Please note it is important that *bf16*, *fp16* and *quanto* are all in lowercase letters.
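+
+ As an illustration, for a hypothetical finetune called *mymodel*, file names following this convention would look like:
+
+ ```
+ mymodel_fp16.safetensors                non-quantized fp16 weights
+ mymodel_quanto_fp16_int8.safetensors    quanto int8 quantization of the fp16 weights
+ mymodel_quanto_bf16_int8.safetensors    quanto int8 quantization of the bf16 weights
+ ```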
70
+
71
+ ## Creating a Quanto Quantized file
72
+ If you launch the app with the *--save-quantized* switch, WanGP will create a quantized file in the **ckpts** subfolder just after the model has been loaded. Please note that the model will be *bf16* or *fp16* quantized depending on what you chose in the configuration menu. A condensed command sketch follows the steps below.
73
+
74
+ 1) Make sure that in the finetune definition json file there is only a URL or filepath that points to the non quantized model
75
+ 2) Launch WanGP *python wgp.py --save-quantized*
76
+ 3) In the configuration menu, for the *Transformer Data Type* property choose either *BF16* or *FP16*
77
+ 4) Launch a video generation (settings used do not matter). As soon as the model is loaded, a new quantized model will be created in the **ckpts** subfolder if it doesn't already exist.
78
+ 5) WanGP will automatically update the finetune definition file with the local path of the newly created quantized file (the list "URLs" will have an extra value such as *"ckpts/finetune_quanto_fp16_int8.safetensors"*)
79
+ 6) Remove *--save-quantized*, restart WanGP and select *Scaled Int8 Quantization* in the *Transformer Model Quantization* property
80
+ 7) Launch a new generation and verify in the terminal window that the right quantized model is loaded
81
+ 8) In order to share the finetune definition file you will need to store the finetuned model weights in the cloud. You can upload them for instance on *Huggingface*. You can then replace the local path in the finetune definition file with a URL (on Huggingface, to get the URL of the model file, click *Copy download link* when accessing the model properties)
82
+
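+ As an illustration, after step 5 the "URLs" list of a hypothetical finetune definition could look like this (the first entry is a placeholder for your own non quantized model):
+ ```
+ "URLs": [
+     "https://huggingface.co/youraccount/yourmodel/resolve/main/finetune_fp16.safetensors",
+     "ckpts/finetune_quanto_fp16_int8.safetensors"
+ ]
+ ```
+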
83
+ You need to create a quantized model specifically for *bf16* or *fp16* as they cannot be converted on the fly. However there is no need to do this for non quantized models as they can be converted on the fly while being loaded.
84
+
85
+ Wan models support both *fp16* and *bf16* data types, although *fp16* in theory delivers better quality. On the contrary, Hunyuan and LTXV support only *bf16*.
docs/GETTING_STARTED.md ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Getting Started with WanGP
2
+
3
+ This guide will help you get started with WanGP video generation quickly and easily.
4
+
5
+ ## Prerequisites
6
+
7
+ Before starting, ensure you have:
8
+ - A compatible GPU (RTX 10XX or newer recommended)
9
+ - Python 3.10.9 installed
10
+ - At least 6GB of VRAM for basic models
11
+ - Internet connection for model downloads
12
+
13
+ ## Quick Setup
14
+
15
+ ### Option 1: One-Click Installation (Recommended)
16
+ Use [Pinokio App](https://pinokio.computer/) for the easiest installation experience.
17
+
18
+ ### Option 2: Manual Installation
19
+ ```bash
20
+ git clone https://github.com/deepbeepmeep/Wan2GP.git
21
+ cd Wan2GP
22
+ conda create -n wan2gp python=3.10.9
23
+ conda activate wan2gp
24
+ pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ For detailed installation instructions, see [INSTALLATION.md](INSTALLATION.md).
29
+
30
+ ## First Launch
31
+
32
+ ### Basic Launch
33
+ ```bash
34
+ python wgp.py
35
+ ```
36
+ This launches the WanGP generator with default settings. You will be able to pick the model you want to use from a drop-down menu.
37
+
38
+ ### Alternative Modes
39
+ ```bash
40
+ python wgp.py --i2v # Wan Image-to-video mode
41
+ python wgp.py --t2v-1-3B # Wan Smaller, faster model
42
+ ```
43
+
44
+ ## Understanding the Interface
45
+
46
+ When you launch WanGP, you'll see a web interface with several sections:
47
+
48
+ ### Main Generation Panel
49
+ - **Model Selection**: Dropdown to choose between different models
50
+ - **Prompt**: Text description of what you want to generate
51
+ - **Generate Button**: Start the video generation process
52
+
53
+ ### Advanced Settings (click checkbox to enable)
54
+ - **Generation Settings**: Steps, guidance, seeds
55
+ - **Loras**: Additional style customizations
56
+ - **Sliding Window**: For longer videos
57
+
58
+ ## Your First Video
59
+
60
+ Let's generate a simple text-to-video:
61
+
62
+ 1. **Launch WanGP**: `python wgp.py`
63
+ 2. **Open Browser**: Navigate to `http://localhost:7860`
64
+ 3. **Enter Prompt**: "A cat walking in a garden"
65
+ 4. **Click Generate**: Wait for the video to be created
66
+ 5. **View Result**: The video will appear in the output section
67
+
68
+ ### Recommended First Settings
69
+ - **Model**: Wan 2.1 text2video 1.3B (faster, lower VRAM)
70
+ - **Frames**: 49 (about 2 seconds)
71
+ - **Steps**: 20 (good balance of speed/quality)
72
+
73
+ ## Model Selection
74
+
75
+ ### Text-to-Video Models
76
+ - **Wan 2.1 T2V 1.3B**: Fastest, lowest VRAM (6GB), good quality
77
+ - **Wan 2.1 T2V 14B**: Best quality, requires more VRAM (12GB+)
78
+ - **Hunyuan Video**: Excellent quality, slower generation
79
+ - **LTX Video**: Good for longer videos
80
+
81
+ ### Image-to-Video Models
82
+ - **Wan Fun InP 1.3B**: Fast image animation
83
+ - **Wan Fun InP 14B**: Higher quality image animation
84
+ - **VACE**: Advanced control over video generation
85
+
86
+ ### Choosing the Right Model
87
+ - **Low VRAM (6-8GB)**: Use 1.3B models
88
+ - **Medium VRAM (10-12GB)**: Use 14B models or Hunyuan
89
+ - **High VRAM (16GB+)**: Any model, longer videos
90
+
91
+ ## Basic Settings Explained
92
+
93
+ ### Generation Settings
94
+ - **Frames**: Number of frames (more = longer video)
95
+ - 25 frames ≈ 1 second
96
+ - 49 frames ≈ 2 seconds
97
+ - 73 frames ≈ 3 seconds
98
+
99
+ - **Steps**: Quality vs Speed tradeoff
100
+ - 15 steps: Fast, lower quality
101
+ - 20 steps: Good balance
102
+ - 30+ steps: High quality, slower
103
+
104
+ - **Guidance Scale**: How closely to follow the prompt
105
+ - 3-5: More creative interpretation
106
+ - 7-10: Closer to prompt description
107
+ - 12+: Very literal interpretation
108
+
109
+ ### Seeds
110
+ - **Random Seed**: Different result each time
111
+ - **Fixed Seed**: Reproducible results
112
+ - **Use same seed + prompt**: Generate variations
113
+
114
+ ## Common Beginner Issues
115
+
116
+ ### "Out of Memory" Errors
117
+ 1. Use smaller models (1.3B instead of 14B)
118
+ 2. Reduce frame count
119
+ 3. Lower resolution in advanced settings
120
+ 4. Enable quantization (usually on by default)
121
+
122
+ ### Slow Generation
123
+ 1. Use 1.3B models for speed
124
+ 2. Reduce number of steps
125
+ 3. Install Sage attention (see [INSTALLATION.md](INSTALLATION.md))
126
+ 4. Enable TeaCache: `python wgp.py --teacache 2.0`
127
+
128
+ ### Poor Quality Results
129
+ 1. Increase number of steps (25-30)
130
+ 2. Improve prompt description
131
+ 3. Use 14B models if you have enough VRAM
132
+ 4. Enable Skip Layer Guidance in advanced settings
133
+
134
+ ## Writing Good Prompts
135
+
136
+ ### Basic Structure
137
+ ```
138
+ [Subject] [Action] [Setting] [Style/Quality modifiers]
139
+ ```
140
+
141
+ ### Examples
142
+ ```
143
+ A red sports car driving through a mountain road at sunset, cinematic, high quality
144
+
145
+ A woman with long hair walking on a beach, waves in the background, realistic, detailed
146
+
147
+ A cat sitting on a windowsill watching rain, cozy atmosphere, soft lighting
148
+ ```
149
+
150
+ ### Tips
151
+ - Be specific about what you want
152
+ - Include style descriptions (cinematic, realistic, etc.)
153
+ - Mention lighting and atmosphere
154
+ - Describe the setting in detail
155
+ - Use quality modifiers (high quality, detailed, etc.)
156
+
157
+ ## Next Steps
158
+
159
+ Once you're comfortable with basic generation:
160
+
161
+ 1. **Explore Advanced Features**:
162
+ - [Loras Guide](LORAS.md) - Customize styles and characters
163
+ - [VACE ControlNet](VACE.md) - Advanced video control
164
+ - [Command Line Options](CLI.md) - Optimize performance
165
+
166
+ 2. **Improve Performance**:
167
+ - Install better attention mechanisms
168
+ - Optimize memory settings
169
+ - Use compilation for speed
170
+
171
+ 3. **Join the Community**:
172
+ - [Discord Server](https://discord.gg/g7efUW9jGV) - Get help and share videos
173
+ - Share your best results
174
+ - Learn from other users
175
+
176
+ ## Troubleshooting First Steps
177
+
178
+ ### Installation Issues
179
+ - Ensure Python 3.10.9 is used
180
+ - Check CUDA version compatibility
181
+ - See [INSTALLATION.md](INSTALLATION.md) for detailed steps
182
+
183
+ ### Generation Issues
184
+ - Check GPU compatibility
185
+ - Verify sufficient VRAM
186
+ - Try basic settings first
187
+ - See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for specific issues
188
+
189
+ ### Performance Issues
190
+ - Use appropriate model for your hardware
191
+ - Enable performance optimizations
192
+ - Check [CLI.md](CLI.md) for optimization flags
193
+
194
+ Remember: Start simple and gradually explore more advanced features as you become comfortable with the basics!
docs/INSTALLATION.md ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Installation Guide
2
+
3
+ This guide covers installation for different GPU generations and operating systems.
4
+
5
+ ## Requirements
6
+
7
+ - Python 3.10.9
8
+ - Conda or Python venv
9
+ - Compatible GPU (RTX 10XX or newer recommended)
10
+
11
+ ## Installation for RTX 10XX to RTX 40XX (Stable)
12
+
13
+ This installation uses PyTorch 2.6.0 which is well-tested and stable.
14
+
15
+ ### Step 1: Download and Setup Environment
16
+
17
+ ```shell
18
+ # Clone the repository
19
+ git clone https://github.com/deepbeepmeep/Wan2GP.git
20
+ cd Wan2GP
21
+
22
+ # Create Python 3.10.9 environment using conda
23
+ conda create -n wan2gp python=3.10.9
24
+ conda activate wan2gp
25
+ ```
26
+
27
+ ### Step 2: Install PyTorch
28
+
29
+ ```shell
30
+ # Install PyTorch 2.6.0 with CUDA 12.4
31
+ pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
32
+ ```
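+
+ You can optionally verify that PyTorch sees your GPU before continuing:
+
+ ```shell
+ python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+ ```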
33
+
34
+ ### Step 3: Install Dependencies
35
+
36
+ ```shell
37
+ # Install core dependencies
38
+ pip install -r requirements.txt
39
+ ```
40
+
41
+ ### Step 4: Optional Performance Optimizations
42
+
43
+ #### Sage Attention (30% faster)
44
+
45
+ ```shell
46
+ # Windows only: Install Triton
47
+ pip install triton-windows
48
+
49
+ # For both Windows and Linux
50
+ pip install sageattention==1.0.6
51
+ ```
52
+
53
+ #### Sage 2 Attention (40% faster)
54
+
55
+ ```shell
56
+ # Windows
57
+ pip install triton-windows
58
+ pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu126torch2.6.0-cp310-cp310-win_amd64.whl
59
+
60
+ # Linux (manual compilation required)
61
+ git clone https://github.com/thu-ml/SageAttention
62
+ cd SageAttention
63
+ pip install -e .
64
+ ```
65
+
66
+ #### Flash Attention
67
+
68
+ ```shell
69
+ # May require CUDA kernel compilation on Windows
70
+ pip install flash-attn==2.7.2.post1
71
+ ```
72
+
73
+ ## Installation for RTX 50XX (Beta)
74
+
75
+ RTX 50XX GPUs require PyTorch 2.7.0 (beta). This version may be less stable.
76
+
77
+ ⚠️ **Important:** Use Python 3.10 for compatibility with pip wheels.
78
+
79
+ ### Step 1: Setup Environment
80
+
81
+ ```shell
82
+ # Clone and setup (same as above)
83
+ git clone https://github.com/deepbeepmeep/Wan2GP.git
84
+ cd Wan2GP
85
+ conda create -n wan2gp python=3.10.9
86
+ conda activate wan2gp
87
+ ```
88
+
89
+ ### Step 2: Install PyTorch Beta
90
+
91
+ ```shell
92
+ # Install PyTorch 2.7.0 with CUDA 12.8
93
+ pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
94
+ ```
95
+
96
+ ### Step 3: Install Dependencies
97
+
98
+ ```shell
99
+ pip install -r requirements.txt
100
+ ```
101
+
102
+ ### Step 4: Optional Optimizations for RTX 50XX
103
+
104
+ #### Sage Attention
105
+
106
+ ```shell
107
+ # Windows
108
+ pip install triton-windows
109
+ pip install sageattention==1.0.6
110
+
111
+ # Linux
112
+ pip install sageattention==1.0.6
113
+ ```
114
+
115
+ #### Sage 2 Attention
116
+
117
+ ```shell
118
+ # Windows
119
+ pip install triton-windows
120
+ pip install https://github.com/woct0rdho/SageAttention/releases/download/v2.1.1-windows/sageattention-2.1.1+cu128torch2.7.0-cp310-cp310-win_amd64.whl
121
+
122
+ # Linux (manual compilation)
123
+ git clone https://github.com/thu-ml/SageAttention
124
+ cd SageAttention
125
+ pip install -e .
126
+ ```
127
+
128
+ ## Attention Modes
129
+
130
+ WanGP supports several attention implementations:
131
+
132
+ - **SDPA** (default): Available by default with PyTorch
133
+ - **Sage**: 30% speed boost with small quality cost
134
+ - **Sage2**: 40% speed boost
135
+ - **Flash**: Good performance, may be complex to install on Windows
136
+
137
+ ## Performance Profiles
138
+
139
+ Choose a profile based on your hardware:
140
+
141
+ - **Profile 3 (LowRAM_HighVRAM)**: Loads entire model in VRAM, requires 24GB VRAM for 8-bit quantized 14B model
142
+ - **Profile 4 (LowRAM_LowVRAM)**: Default, loads model parts as needed, slower but lower VRAM requirement
143
+
144
+ ## Troubleshooting
145
+
146
+ ### Sage Attention Issues
147
+
148
+ If Sage attention doesn't work:
149
+
150
+ 1. Check if Triton is properly installed
151
+ 2. Clear Triton cache
152
+ 3. Fallback to SDPA attention:
153
+ ```bash
154
+ python wgp.py --attention sdpa
155
+ ```
156
+
157
+ ### Memory Issues
158
+
159
+ - Use lower resolution or shorter videos
160
+ - Enable quantization (default)
161
+ - Use Profile 4 for lower VRAM usage
162
+ - Consider using 1.3B models instead of 14B models
163
+
164
+ ### GPU Compatibility
165
+
166
+ - RTX 10XX, 20XX: Supported with SDPA attention
167
+ - RTX 30XX, 40XX: Full feature support
168
+ - RTX 50XX: Beta support with PyTorch 2.7.0
169
+
170
+ For more troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md)
docs/LORAS.md ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Loras Guide
2
+
3
+ Loras (Low-Rank Adaptations) allow you to customize video generation models by adding specific styles, characters, or effects to your videos.
4
+
5
+ ## Directory Structure
6
+
7
+ Loras are organized in different folders based on the model they're designed for:
8
+
9
+ ### Text-to-Video Models
10
+ - `loras/` - General t2v loras
11
+ - `loras/1.3B/` - Loras specifically for 1.3B models
12
+ - `loras/14B/` - Loras specifically for 14B models
13
+
14
+ ### Image-to-Video Models
15
+ - `loras_i2v/` - Image-to-video loras
16
+
17
+ ### Other Models
18
+ - `loras_hunyuan/` - Hunyuan Video t2v loras
19
+ - `loras_hunyuan_i2v/` - Hunyuan Video i2v loras
20
+ - `loras_ltxv/` - LTX Video loras
21
+
22
+ ## Custom Lora Directory
23
+
24
+ You can specify custom lora directories when launching the app:
25
+
26
+ ```bash
27
+ # Use shared lora directory for both t2v and i2v
28
+ python wgp.py --lora-dir /path/to/shared/loras --lora-dir-i2v /path/to/shared/loras
29
+
30
+ # Specify different directories for different models
31
+ python wgp.py --lora-dir-hunyuan /path/to/hunyuan/loras --lora-dir-ltxv /path/to/ltx/loras
32
+ ```
33
+
34
+ ## Using Loras
35
+
36
+ ### Basic Usage
37
+
38
+ 1. Place your lora files in the appropriate directory
39
+ 2. Launch WanGP
40
+ 3. In the Advanced Tab, select the "Loras" section
41
+ 4. Check the loras you want to activate
42
+ 5. Set multipliers for each lora (default is 1.0)
43
+
44
+ ### Lora Multipliers
45
+
46
+ Multipliers control the strength of each lora's effect:
47
+
48
+ #### Simple Multipliers
49
+ ```
50
+ 1.2 0.8
51
+ ```
52
+ - First lora: 1.2 strength
53
+ - Second lora: 0.8 strength
54
+
55
+ #### Time-based Multipliers
56
+ For dynamic effects over generation steps, use comma-separated values:
57
+ ```
58
+ 0.9,0.8,0.7
59
+ 1.2,1.1,1.0
60
+ ```
61
+ - For 30 steps: steps 0-9 use first value, 10-19 use second, 20-29 use third
62
+ - First lora: 0.9 → 0.8 → 0.7
63
+ - Second lora: 1.2 → 1.1 → 1.0
64
+
65
+ ## Lora Presets
66
+
67
+ Presets are combinations of loras with predefined multipliers and prompts.
68
+
69
+ ### Creating Presets
70
+ 1. Configure your loras and multipliers
71
+ 2. Write a prompt with comments (lines starting with #)
72
+ 3. Save as a preset with `.lset` extension
73
+
74
+ ### Example Preset
75
+ ```
76
+ # Use the keyword "ohnvx" to trigger the lora
77
+ A ohnvx character is driving a car through the city
78
+ ```
79
+
80
+ ### Using Presets
81
+ ```bash
82
+ # Load preset on startup
83
+ python wgp.py --lora-preset mypreset.lset
84
+ ```
85
+
86
+ ### Managing Presets
87
+ - Edit, save, or delete presets directly from the web interface
88
+ - Presets include comments with usage instructions
89
+ - Share `.lset` files with other users
90
+
91
+ ## Supported Formats
92
+
93
+ WanGP supports multiple lora formats:
94
+ - **Safetensors** (.safetensors)
95
+ - **Replicate** format
96
+ - **Standard PyTorch** (.pt, .pth)
97
+
98
+ ## Self-Forcing lightx2v Lora (Video Generation Accelerator)
99
+
100
+ The Self-Forcing lightx2v Lora has been created by Kijai from the Self-Forcing lightx2v distilled Wan model. It can generate videos in as few as 2 steps and also offers a 2x speed improvement since it doesn't require classifier free guidance. It works on both t2v and i2v models.
101
+
102
+ ### Setup Instructions
103
+ 1. Download the Lora:
104
+ ```
105
+ https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_T2V_14B_lightx2v_cfg_step_distill_lora_rank32.safetensors
106
+ ```
107
+ 2. Place in your `loras/` directory
108
+
109
+ ### Usage
110
+ 1. Select a Wan t2v or i2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
111
+ 2. Enable Advanced Mode
112
+ 3. In Advanced Generation Tab:
113
+ - Set Guidance Scale = 1
114
+ - Set Shift Scale = 5
115
+ 4. In Advanced Lora Tab:
116
+ - Select the Lora above
117
+ - Set multiplier to 1
118
+ 5. Set generation steps to 2-8
119
+ 6. Generate!
120
+
121
+ ## CausVid Lora (Video Generation Accelerator)
122
+
123
+ CausVid is a distilled Wan model that generates videos in 4-12 steps with a 2x speed improvement.
124
+
125
+ ### Setup Instructions
126
+ 1. Download the CausVid Lora:
127
+ ```
128
+ https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors
129
+ ```
130
+ 2. Place in your `loras/` directory
131
+
132
+ ### Usage
133
+ 1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B)
134
+ 2. Enable Advanced Mode
135
+ 3. In Advanced Generation Tab:
136
+ - Set Guidance Scale = 1
137
+ - Set Shift Scale = 7
138
+ 4. In Advanced Lora Tab:
139
+ - Select CausVid Lora
140
+ - Set multiplier to 0.3
141
+ 5. Set generation steps to 12
142
+ 6. Generate!
143
+
144
+ ### CausVid Step/Multiplier Relationship
145
+ - **12 steps**: 0.3 multiplier (recommended)
146
+ - **8 steps**: 0.5-0.7 multiplier
147
+ - **4 steps**: 0.8-1.0 multiplier
148
+
149
+ *Note: Lower steps = lower quality (especially motion)*
150
+
151
+
152
+
153
+ ## AccVid Lora (Video Generation Accelerator)
154
+
155
+ AccVid is a distilled Wan model that generates videos with a 2x speed improvement since classifier free guidance is no longer needed (that is cfg = 1).
156
+
157
+ ### Setup Instructions
158
+ 1. Download the AccVid Lora:
159
+
160
+ - for t2v models:
161
+ ```
162
+ https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_AccVid_T2V_14B_lora_rank32_fp16.safetensors
163
+ ```
164
+
165
+ - for i2v models:
166
+ ```
167
+ https://huggingface.co/Kijai/WanVideo_comfy/blob/main/Wan21_AccVid_I2V_480P_14B_lora_rank32_fp16.safetensors
168
+ ```
169
+
170
+ 2. Place in your `loras/` directory or `loras_i2v/` directory
171
+
172
+ ### Usage
173
+ 1. Select a Wan t2v model (e.g., Wan 2.1 text2video 14B or Vace 14B) or a Wan i2v model
174
+ 2. Enable Advanced Mode
175
+ 3. In Advanced Generation Tab:
176
+ - Set Guidance Scale = 1
177
+ - Set Shift Scale = 5
178
+ 4. The number of steps remains unchanged compared to what you would use with the original model, but generation will be about twice as fast since classifier free guidance is not needed
179
+
180
+
181
+
182
+
183
+ ## Performance Tips
184
+
185
+ ### Fast Loading/Unloading
186
+ - Loras can be added/removed without restarting the app
187
+ - Use the "Refresh" button to detect new loras
188
+ - Enable `--check-loras` to filter incompatible loras (slower startup)
189
+
190
+ ### Memory Management
191
+ - Loras are loaded on-demand to save VRAM
192
+ - Multiple loras can be used simultaneously
193
+ - Time-based multipliers don't use extra memory
194
+
195
+ ## Finding Loras
196
+
197
+ ### Sources
198
+ - **[Civitai](https://civitai.com/)** - Large community collection
199
+ - **HuggingFace** - Official and community loras
200
+ - **Discord Server** - Community recommendations
201
+
202
+ ### Creating Loras
203
+ - **Kohya** - Popular training tool
204
+ - **OneTrainer** - Alternative training solution
205
+ - **Custom datasets** - Train on your own content
206
+
207
+ ## Macro System (Advanced)
208
+
209
+ Create multiple prompts from templates using macros:
210
+
211
+ ```
212
+ ! {Subject}="cat","woman","man", {Location}="forest","lake","city", {Possessive}="its","her","his"
213
+ In the video, a {Subject} is presented. The {Subject} is in a {Location} and looks at {Possessive} watch.
214
+ ```
215
+
216
+ This generates:
217
+ 1. "In the video, a cat is presented. The cat is in a forest and looks at its watch."
218
+ 2. "In the video, a woman is presented. The woman is in a lake and looks at her watch."
219
+ 3. "In the video, a man is presented. The man is in a city and looks at his watch."
220
+
221
+ ## Troubleshooting
222
+
223
+ ### Lora Not Working
224
+ 1. Check if lora is compatible with your model size (1.3B vs 14B)
225
+ 2. Verify lora format is supported
226
+ 3. Try different multiplier values
227
+ 4. Check the lora was trained for your model type (t2v vs i2v)
228
+
229
+ ### Performance Issues
230
+ 1. Reduce number of active loras
231
+ 2. Lower multiplier values
232
+ 3. Use `--check-loras` to filter incompatible files
233
+ 4. Clear lora cache if issues persist
234
+
235
+ ### Memory Errors
236
+ 1. Use fewer loras simultaneously
237
+ 2. Reduce model size (use 1.3B instead of 14B)
238
+ 3. Lower video resolution or frame count
239
+ 4. Enable quantization if not already active
240
+
241
+ ## Command Line Options
242
+
243
+ ```bash
244
+ # Lora-related command line options
245
+ --lora-dir path # Path to t2v loras directory
246
+ --lora-dir-i2v path # Path to i2v loras directory
247
+ --lora-dir-hunyuan path # Path to Hunyuan t2v loras
248
+ --lora-dir-hunyuan-i2v path # Path to Hunyuan i2v loras
249
+ --lora-dir-ltxv path # Path to LTX Video loras
250
+ --lora-preset preset # Load preset on startup
251
+ --check-loras # Filter incompatible loras
252
+ ```
docs/MODELS.md ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Models Overview
2
+
3
+ WanGP supports multiple video generation models, each optimized for different use cases and hardware configurations.
4
+
5
+
6
+ ## Wan 2.1 Text2Video Models
7
+ Please note that the term *Text2Video* refers to the underlying Wan architecture, but as it has been greatly improved over time, many derived Text2Video models can now also generate videos using images.
8
+
9
+ #### Wan 2.1 Text2Video 1.3B
10
+ - **Size**: 1.3 billion parameters
11
+ - **VRAM**: 6GB minimum
12
+ - **Speed**: Fast generation
13
+ - **Quality**: Good quality for the size
14
+ - **Best for**: Quick iterations, lower-end hardware
15
+ - **Command**: `python wgp.py --t2v-1-3B`
16
+
17
+ #### Wan 2.1 Text2Video 14B
18
+ - **Size**: 14 billion parameters
19
+ - **VRAM**: 12GB+ recommended
20
+ - **Speed**: Slower but higher quality
21
+ - **Quality**: Excellent detail and coherence
22
+ - **Best for**: Final production videos
23
+ - **Command**: `python wgp.py --t2v-14B`
24
+
25
+ #### Wan Vace 1.3B
26
+ - **Type**: ControlNet for advanced video control
27
+ - **VRAM**: 6GB minimum
28
+ - **Features**: Motion transfer, object injection, inpainting
29
+ - **Best for**: Advanced video manipulation
30
+ - **Command**: `python wgp.py --vace-1.3B`
31
+
32
+ #### Wan Vace 14B
33
+ - **Type**: Large ControlNet model
34
+ - **VRAM**: 12GB+ recommended
35
+ - **Features**: All Vace features with higher quality
36
+ - **Best for**: Professional video editing workflows
37
+
38
+ #### MoviiGen (Experimental)
39
+ - **Resolution**: Claims 1080p capability
40
+ - **VRAM**: 20GB+ required
41
+ - **Speed**: Very slow generation
42
+ - **Features**: Should generate cinema-like video, specialized for 2.1:1 aspect ratios
43
+ - **Status**: Experimental, feedback welcome
44
+
45
+ <BR>
46
+
47
+ ## Wan 2.1 Image-to-Video Models
48
+
49
+ #### Wan 2.1 Image2Video 14B
50
+ - **Size**: 14 billion parameters
51
+ - **VRAM**: 12GB+ recommended
52
+ - **Speed**: Slower but higher quality
53
+ - **Quality**: Excellent detail and coherence
54
+ - **Best for**: General purpose image animation; most available Loras work with this model
55
+ - **Command**: `python wgp.py --i2v-14B`
56
+
57
+ #### FLF2V
58
+ - **Type**: Start/end frame specialist
59
+ - **Resolution**: Optimized for 720p
60
+ - **Official**: Wan team supported
61
+ - **Use case**: Image-to-video with specific endpoints
62
+
63
+
64
+ <BR>
65
+
66
+ ## Wan 2.1 Specialized Models
67
+
68
+ #### FantasySpeaking
69
+ - **Type**: Talking head animation
70
+ - **Input**: Voice track + image
71
+ - **Works on**: People and objects
72
+ - **Use case**: Lip-sync and voice-driven animation
73
+
74
+ #### Phantom
75
+ - **Type**: Person/object transfer
76
+ - **Resolution**: Works well at 720p
77
+ - **Requirements**: 30+ steps for good results
78
+ - **Best for**: Transferring subjects between videos
79
+
80
+ #### Recam Master
81
+ - **Type**: Viewpoint change
82
+ - **Requirements**: 81+ frame input videos, 15+ denoising steps
83
+ - **Use case**: View same scene from different angles
84
+
85
+ #### Sky Reels v2
86
+ - **Type**: Diffusion Forcing model
87
+ - **Specialty**: "Infinite length" videos
88
+ - **Features**: High quality continuous generation
89
+
90
+
91
+ <BR>
92
+
93
+ ## Wan Fun InP Models
94
+
95
+ #### Wan Fun InP 1.3B
96
+ - **Size**: 1.3 billion parameters
97
+ - **VRAM**: 6GB minimum
98
+ - **Quality**: Good for the size, accessible to lower hardware
99
+ - **Best for**: Entry-level image animation
100
+ - **Command**: `python wgp.py --i2v-1-3B`
101
+
102
+ #### Wan Fun InP 14B
103
+ - **Size**: 14 billion parameters
104
+ - **VRAM**: 12GB+ recommended
105
+ - **Quality**: Better end image support
106
+ - **Limitation**: Existing loras don't work as well
107
+
108
+ <BR>
109
+
110
+ ## Wan Special Loras
111
+ ### Self-Forcing lightx2v Lora
112
+ - **Type**: Distilled model (Lora implementation)
113
+ - **Speed**: 4-8 steps generation, 2x faster (no classifier free guidance)
114
+ - **Compatible**: Works with t2v and i2v Wan 14B models
115
+ - **Setup**: Requires the Self-Forcing lightx2v Lora (see [LORAS.md](LORAS.md))
116
+
117
+
118
+ ### Causvid Lora
119
+ - **Type**: Distilled model (Lora implementation)
120
+ - **Speed**: 4-12 steps generation, 2x faster (no classifier free guidance)
121
+ - **Compatible**: Works with Wan 14B models
122
+ - **Setup**: Requires CausVid Lora (see [LORAS.md](LORAS.md))
123
+
124
+
125
+ <BR>
126
+
127
+ ## Hunyuan Video Models
128
+
129
+ #### Hunyuan Video Text2Video
130
+ - **Quality**: Among the best open source t2v models
131
+ - **VRAM**: 12GB+ recommended
132
+ - **Speed**: Slower generation but excellent results
133
+ - **Features**: Superior text adherence and video quality, up to 10s of video
134
+ - **Best for**: High-quality text-to-video generation
135
+
136
+ #### Hunyuan Video Custom
137
+ - **Specialty**: Identity preservation
138
+ - **Use case**: Injecting specific people into videos
139
+ - **Quality**: Excellent for character consistency
140
+ - **Best for**: Character-focused video generation
141
+
142
+ #### Hunyuan Video Avatar
143
+ - **Specialty**: Generates up to 15s of high quality speech / song driven video
144
+ - **Use case**: Injecting specific people into videos
145
+ - **Quality**: Excellent for character consistency
146
+ - **Best for**: Character-focused video generation, video synchronized with voice
147
+
148
+
149
+ <BR>
150
+
151
+ ## LTX Video Models
152
+
153
+ #### LTX Video 13B
154
+ - **Specialty**: Long video generation
155
+ - **Resolution**: Fast 720p generation
156
+ - **VRAM**: Optimized by WanGP (4x reduction in requirements)
157
+ - **Best for**: Longer duration videos
158
+
159
+ #### LTX Video 13B Distilled
160
+ - **Speed**: Generate in less than one minute
161
+ - **Quality**: Very high quality despite speed
162
+ - **Best for**: Rapid prototyping and quick results
163
+
164
+ <BR>
165
+
166
+ ## Model Selection Guide
167
+
168
+ ### By Hardware (VRAM)
169
+
170
+ #### 6-8GB VRAM
171
+ - Wan 2.1 T2V 1.3B
172
+ - Wan Fun InP 1.3B
173
+ - Wan Vace 1.3B
174
+
175
+ #### 10-12GB VRAM
176
+ - Wan 2.1 T2V 14B
177
+ - Wan Fun InP 14B
178
+ - Hunyuan Video (with optimizations)
179
+ - LTX Video 13B
180
+
181
+ #### 16GB+ VRAM
182
+ - All models supported
183
+ - Longer videos possible
184
+ - Higher resolutions
185
+ - Multiple simultaneous Loras
186
+
187
+ #### 20GB+ VRAM
188
+ - MoviiGen (experimental 1080p)
189
+ - Very long videos
190
+ - Maximum quality settings
191
+
192
+ ### By Use Case
193
+
194
+ #### Quick Prototyping
195
+ 1. **LTX Video 13B Distilled** - Fastest, high quality
196
+ 2. **Wan 2.1 T2V 1.3B** - Fast, good quality
197
+ 3. **CausVid Lora** - 4-12 steps, very fast
198
+
199
+ #### Best Quality
200
+ 1. **Hunyuan Video** - Overall best t2v quality
201
+ 2. **Wan 2.1 T2V 14B** - Excellent Wan quality
202
+ 3. **Wan Vace 14B** - Best for controlled generation
203
+
204
+ #### Advanced Control
205
+ 1. **Wan Vace 14B/1.3B** - Motion transfer, object injection
206
+ 2. **Phantom** - Person/object transfer
207
+ 3. **FantasySpeaking** - Voice-driven animation
208
+
209
+ #### Long Videos
210
+ 1. **LTX Video 13B** - Specialized for length
211
+ 2. **Sky Reels v2** - Infinite length videos
212
+ 3. **Wan Vace + Sliding Windows** - Up to 1 minute
213
+
214
+ #### Lower Hardware
215
+ 1. **Wan Fun InP 1.3B** - Image-to-video
216
+ 2. **Wan 2.1 T2V 1.3B** - Text-to-video
217
+ 3. **Wan Vace 1.3B** - Advanced control
218
+
219
+ <BR>
220
+
221
+ ## Performance Comparison
222
+
223
+ ### Speed (Relative)
224
+ 1. **CausVid Lora** (4-12 steps) - Fastest
225
+ 2. **LTX Video Distilled** - Very fast
226
+ 3. **Wan 1.3B models** - Fast
227
+ 4. **Wan 14B models** - Medium
228
+ 5. **Hunyuan Video** - Slower
229
+ 6. **MoviiGen** - Slowest
230
+
231
+ ### Quality (Subjective)
232
+ 1. **Hunyuan Video** - Highest overall
233
+ 2. **Wan 14B models** - Excellent
234
+ 3. **LTX Video models** - Very good
235
+ 4. **Wan 1.3B models** - Good
236
+ 5. **CausVid** - Good (varies with steps)
237
+
238
+ ### VRAM Efficiency
239
+ 1. **Wan 1.3B models** - Most efficient
240
+ 2. **LTX Video** (with WanGP optimizations)
241
+ 3. **Wan 14B models**
242
+ 4. **Hunyuan Video**
243
+ 5. **MoviiGen** - Least efficient
244
+
245
+ <BR>
246
+
247
+ ## Model Switching
248
+
249
+ WanGP allows switching between models without restarting:
250
+
251
+ 1. Use the dropdown menu in the web interface
252
+ 2. Models are loaded on-demand
253
+ 3. Previous model is unloaded to save VRAM
254
+ 4. Settings are preserved when possible
255
+
256
+ <BR>
257
+
258
+ ## Tips for Model Selection
259
+
260
+ ### First Time Users
261
+ Start with **Wan 2.1 T2V 1.3B** to learn the interface and test your hardware.
262
+
263
+ ### Production Work
264
+ Use **Hunyuan Video** or **Wan 14B** models for final output quality.
265
+
266
+ ### Experimentation
267
+ **CausVid Lora** or **LTX Distilled** for rapid iteration and testing.
268
+
269
+ ### Specialized Tasks
270
+ - **VACE** for advanced control
271
+ - **FantasySpeaking** for talking heads
272
+ - **LTX Video** for long sequences
273
+
274
+ ### Hardware Optimization
275
+ Always start with the largest model your VRAM can handle, then optimize settings for speed vs quality based on your needs.
docs/TROUBLESHOOTING.md ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Troubleshooting Guide
2
+
3
+ This guide covers common issues and their solutions when using WanGP.
4
+
5
+ ## Installation Issues
6
+
7
+ ### PyTorch Installation Problems
8
+
9
+ #### CUDA Version Mismatch
10
+ **Problem**: PyTorch can't detect GPU or CUDA errors
11
+ **Solution**:
12
+ ```bash
13
+ # Check your CUDA version
14
+ nvidia-smi
15
+
16
+ # Install matching PyTorch version
17
+ # For CUDA 12.4 (RTX 10XX-40XX)
18
+ pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu124
19
+
20
+ # For CUDA 12.8 (RTX 50XX)
21
+ pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
22
+ ```
23
+
24
+ #### Python Version Issues
25
+ **Problem**: Package compatibility errors
26
+ **Solution**: Ensure you're using Python 3.10.9
27
+ ```bash
28
+ python --version # Should show 3.10.9
29
+ conda create -n wan2gp python=3.10.9
30
+ ```
31
+
32
+ ### Dependency Installation Failures
33
+
34
+ #### Triton Installation (Windows)
35
+ **Problem**: `pip install triton-windows` fails
36
+ **Solution**:
37
+ 1. Update pip: `pip install --upgrade pip`
38
+ 2. Try pre-compiled wheel
39
+ 3. Fallback to SDPA attention: `python wgp.py --attention sdpa`
40
+
41
+ #### SageAttention Compilation Issues
42
+ **Problem**: SageAttention installation fails
43
+ **Solution**:
44
+ 1. Install Visual Studio Build Tools (Windows)
45
+ 2. Use pre-compiled wheels when available
46
+ 3. Fallback to basic attention modes
47
+
48
+ ## Memory Issues
49
+
50
+ ### CUDA Out of Memory
51
+
52
+ #### During Model Loading
53
+ **Problem**: "CUDA out of memory" when loading model
54
+ **Solutions**:
55
+ ```bash
56
+ # Use smaller model
57
+ python wgp.py --t2v-1-3B
58
+
59
+ # Enable quantization (usually default)
60
+ python wgp.py --quantize-transformer True
61
+
62
+ # Use memory-efficient profile
63
+ python wgp.py --profile 4
64
+
65
+ # Reduce preloaded model size
66
+ python wgp.py --preload 0
67
+ ```
68
+
69
+ #### During Video Generation
70
+ **Problem**: Memory error during generation
71
+ **Solutions**:
72
+ 1. Reduce frame count (shorter videos)
73
+ 2. Lower resolution in advanced settings
74
+ 3. Use lower batch size
75
+ 4. Clear GPU cache between generations
76
+
77
+ ### System RAM Issues
78
+
79
+ #### High RAM Usage
80
+ **Problem**: System runs out of RAM
81
+ **Solutions**:
82
+ ```bash
83
+ # Limit reserved memory
84
+ python wgp.py --perc-reserved-mem-max 0.3
85
+
86
+ # Use minimal RAM profile
87
+ python wgp.py --profile 5
88
+
89
+ # Enable swap file (OS level)
90
+ ```
91
+
92
+ ## Performance Issues
93
+
94
+ ### Slow Generation Speed
95
+
96
+ #### General Optimization
97
+ ```bash
98
+ # Enable compilation (requires Triton)
99
+ python wgp.py --compile
100
+
101
+ # Use faster attention
102
+ python wgp.py --attention sage2
103
+
104
+ # Enable TeaCache
105
+ python wgp.py --teacache 2.0
106
+
107
+ # Use high-performance profile
108
+ python wgp.py --profile 3
109
+ ```
110
+
111
+ #### GPU-Specific Optimizations
112
+
113
+ **RTX 10XX/20XX Series**:
114
+ ```bash
115
+ python wgp.py --attention sdpa --profile 4 --teacache 1.5
116
+ ```
117
+
118
+ **RTX 30XX/40XX Series**:
119
+ ```bash
120
+ python wgp.py --compile --attention sage --profile 3 --teacache 2.0
121
+ ```
122
+
123
+ **RTX 50XX Series**:
124
+ ```bash
125
+ python wgp.py --attention sage --profile 4 --fp16
126
+ ```
127
+
128
+ ### Attention Mechanism Issues
129
+
130
+ #### Sage Attention Not Working
131
+ **Problem**: Sage attention fails to compile or work
132
+ **Diagnostic Steps**:
133
+ 1. Check Triton installation:
134
+ ```python
135
+ import triton
136
+ print(triton.__version__)
137
+ ```
138
+ 2. Clear Triton cache:
139
+ ```bash
140
+ # Windows
141
+ rmdir /s %USERPROFILE%\.triton
142
+ # Linux
143
+ rm -rf ~/.triton
144
+ ```
145
+ 3. Fallback solution:
146
+ ```bash
147
+ python wgp.py --attention sdpa
148
+ ```
149
+
150
+ #### Flash Attention Issues
151
+ **Problem**: Flash attention compilation fails
152
+ **Solution**:
153
+ - Windows: Often requires manual CUDA kernel compilation
154
+ - Linux: Usually works with `pip install flash-attn`
155
+ - Fallback: Use Sage or SDPA attention
156
+
157
+ ## Model-Specific Issues
158
+
159
+ ### Lora Problems
160
+
161
+ #### Loras Not Loading
162
+ **Problem**: Loras don't appear in the interface
163
+ **Solutions**:
164
+ 1. Check file format (should be .safetensors, .pt, or .pth)
165
+ 2. Verify correct directory:
166
+ ```
167
+ loras/ # For t2v models
168
+ loras_i2v/ # For i2v models
169
+ loras_hunyuan/ # For Hunyuan models
170
+ ```
171
+ 3. Click "Refresh" button in interface
172
+ 4. Use `--check-loras` to filter incompatible files
173
+
174
+ #### Lora Compatibility Issues
175
+ **Problem**: Lora causes errors or poor results
176
+ **Solutions**:
177
+ 1. Check model size compatibility (1.3B vs 14B)
178
+ 2. Verify lora was trained for your model type
179
+ 3. Try different multiplier values
180
+ 4. Use `--check-loras` flag to auto-filter
181
+
182
+ ### VACE-Specific Issues
183
+
184
+ #### Poor VACE Results
185
+ **Problem**: VACE generates poor quality or unexpected results
186
+ **Solutions**:
187
+ 1. Enable Skip Layer Guidance
188
+ 2. Use detailed prompts describing all elements
189
+ 3. Ensure proper mask creation with Matanyone
190
+ 4. Check reference image quality
191
+ 5. Use at least 15 steps, preferably 30+
192
+
193
+ #### Matanyone Tool Issues
194
+ **Problem**: Mask creation difficulties
195
+ **Solutions**:
196
+ 1. Use negative point prompts to refine selection
197
+ 2. Create multiple sub-masks and combine them
198
+ 3. Try different background removal options
199
+ 4. Ensure sufficient contrast in source video
200
+
201
+ ## Network and Server Issues
202
+
203
+ ### Gradio Interface Problems
204
+
205
+ #### Port Already in Use
206
+ **Problem**: "Port 7860 is already in use"
207
+ **Solution**:
208
+ ```bash
209
+ # Use different port
210
+ python wgp.py --server-port 7861
211
+
212
+ # Or kill existing process
213
+ # Windows
214
+ netstat -ano | findstr :7860
215
+ taskkill /PID <PID> /F
216
+
217
+ # Linux
218
+ lsof -i :7860
219
+ kill <PID>
220
+ ```
221
+
222
+ #### Interface Not Loading
223
+ **Problem**: Browser shows "connection refused"
224
+ **Solutions**:
225
+ 1. Check if server started successfully
226
+ 2. Try `http://127.0.0.1:7860` instead of `localhost:7860`
227
+ 3. Disable firewall temporarily
228
+ 4. Use `--listen` flag for network access
229
+
230
+ ### Remote Access Issues
231
+
232
+ #### Sharing Not Working
233
+ **Problem**: `--share` flag doesn't create public URL
234
+ **Solutions**:
235
+ 1. Check internet connection
236
+ 2. Try different network
237
+ 3. Use `--listen` with port forwarding
238
+ 4. Check firewall settings
239
+
240
+ ## Quality Issues
241
+
242
+ ### Poor Video Quality
243
+
244
+ #### General Quality Improvements
245
+ 1. Increase number of steps (25-30+)
246
+ 2. Use larger models (14B instead of 1.3B)
247
+ 3. Enable Skip Layer Guidance
248
+ 4. Improve prompt descriptions
249
+ 5. Use higher resolution settings
250
+
251
+ #### Specific Quality Issues
252
+
253
+ **Blurry Videos**:
254
+ - Increase steps
255
+ - Check source image quality (i2v)
256
+ - Reduce TeaCache multiplier
257
+ - Use higher guidance scale
258
+
259
+ **Inconsistent Motion**:
260
+ - Use longer overlap in sliding windows
261
+ - Reduce window size
262
+ - Improve prompt consistency
263
+ - Check control video quality (VACE)
264
+
265
+ **Color Issues**:
266
+ - Check model compatibility
267
+ - Adjust guidance scale
268
+ - Verify input image color space
269
+ - Try different VAE settings
270
+
271
+ ## Advanced Debugging
272
+
273
+ ### Enable Verbose Output
274
+ ```bash
275
+ # Maximum verbosity
276
+ python wgp.py --verbose 2
277
+
278
+ # Check lora compatibility
279
+ python wgp.py --check-loras --verbose 2
280
+ ```
281
+
282
+ ### Memory Debugging
283
+ ```bash
284
+ # Monitor GPU memory
285
+ nvidia-smi -l 1
286
+
287
+ # Reduce memory usage
288
+ python wgp.py --profile 4 --perc-reserved-mem-max 0.2
289
+ ```
290
+
291
+ ### Performance Profiling
292
+ ```bash
293
+ # Test different configurations
294
+ python wgp.py --attention sdpa --profile 4 # Baseline
295
+ python wgp.py --attention sage --profile 3 # Performance
296
+ python wgp.py --compile --teacache 2.0 # Maximum speed
297
+ ```
298
+
299
+ ## Getting Help
300
+
301
+ ### Before Asking for Help
302
+ 1. Check this troubleshooting guide
303
+ 2. Read the relevant documentation:
304
+ - [Installation Guide](INSTALLATION.md)
305
+ - [Getting Started](GETTING_STARTED.md)
306
+ - [Command Line Reference](CLI.md)
307
+ 3. Try basic fallback configuration:
308
+ ```bash
309
+ python wgp.py --attention sdpa --profile 4
310
+ ```
311
+
312
+ ### Community Support
313
+ - **Discord Server**: https://discord.gg/g7efUW9jGV
314
+ - Provide relevant information:
315
+ - GPU model and VRAM amount
316
+ - Python and PyTorch versions
317
+ - Complete error messages
318
+ - Command used to launch WanGP
319
+ - Operating system
320
+
321
+ ### Reporting Bugs
322
+ When reporting issues:
323
+ 1. Include system specifications
324
+ 2. Provide complete error logs
325
+ 3. List the exact steps to reproduce
326
+ 4. Mention any modifications to default settings
327
+ 5. Include command line arguments used
328
+
329
+ ## Emergency Fallback
330
+
331
+ If nothing works, try this minimal configuration:
332
+ ```bash
333
+ # Absolute minimum setup
334
+ python wgp.py --t2v-1-3B --attention sdpa --profile 4 --teacache 0 --fp16
335
+
336
+ # If that fails, check basic PyTorch installation
337
+ python -c "import torch; print(torch.cuda.is_available())"
338
+ ```
docs/VACE.md ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # VACE ControlNet Guide
2
+
3
+ VACE is a powerful ControlNet that enables Video-to-Video and Reference-to-Video generation. It allows you to inject your own images into output videos, animate characters, perform inpainting/outpainting, and continue existing videos.
4
+
5
+ ## Overview
6
+
7
+ VACE is probably one of the most powerful Wan models available. With it, you can:
8
+ - Inject people or objects into scenes
9
+ - Animate characters
10
+ - Perform video inpainting and outpainting
11
+ - Continue existing videos
12
+ - Transfer motion from one video to another
13
+ - Change the style of a scene while preserving its structure
14
+
15
+
16
+ ## Getting Started
17
+
18
+ ### Model Selection
19
+ 1. Select either "Vace 1.3B" or "Vace 14B" from the dropdown menu
20
+ 2. Note: VACE works best with videos up to 7 seconds with the Riflex option enabled
21
+
22
+ You can also use any derived Vace model such as Vace FusioniX, or combine Vace with Lora accelerators such as Causvid.
23
+
24
+ ### Input Types
25
+
26
+ #### 1. Control Video
27
+ The Control Video is the source material that contains the instructions about what you want. Vace expects the Control Video to carry some visual hints about the type of processing expected: for instance replacing an area by something else, converting an Open Pose wireframe into human motion, colorizing an area, transferring the depth of an image area, ...
28
+
29
+ For example, anywhere your control video contains the color 127 (grey), it will be considered as an area to be inpainted and replaced by the content of your text prompt and / or a reference image (see below). Likewise, if the frames of a Control Video contain an Open Pose wireframe (basically some straight lines tied together that describe the pose of a person), Vace will automatically turn this Open Pose into a real human based on the text prompt and any reference images (see below).
30
+
31
+ You can either build the Control Video yourself with the annotator tools provided by the Vace team (see the Vace resources at the bottom) or you can let WanGP (recommended option) generate on the fly a Vace formatted Control Video based on the information you provide.
32
+
33
+ WanGP will need the following information to generate a Vace Control Video:
34
+ - A *Control Video* : this video shouldn't have been altered by an annotator tool and can be taken straight from YouTube or your camera
35
+ - *Control Video Process* : This is the type of process you want to apply to the control video. For instance *Transfer Human Motion* will generate the Open Pose information from your video so that you can transfer this same motion to a generated character. If you want to do only *Spatial Outpainting* or *Temporal Inpainting / Outpainting* you may want to choose the *Keep Unchanged* process.
36
+ - *Area Processed* : you can target the processing to a specific area. For instance, even if there are multiple people in the Control Video you may want to replace only one of them. If you decide to target an area you will need to provide a *Video Mask* as well. These types of videos can be easily created using the Matanyone tool embedded with WanGP (see the Matanyone doc below). WanGP can apply different types of process, one on the mask and another one outside the mask.
37
+
38
+ Another nice thing is that you can combine all the effects above with Outpainting, since WanGP will automatically create an outpainting area in the Control Video if you ask for it.
39
+
40
+ By default WanGP will ask Vace to generate new frames in the "same spirit" as the control video if the latter is shorter than the number of frames that you have requested.
41
+
42
+ Be aware that, before anything happens, the Control Video and the Video Mask will be resampled to the frame rate of Vace (usually 16 fps) and resized to the output size you have requested.
43
+ #### 2. Reference Images
44
+ With Reference Images you can inject people or objects of your choice in the Video.
45
+ You can also force Images to appear at specific frame positions in the Video.
46
+
47
+ If the Reference Image is a person or an object, it is recommended to turn on the background remover, which will replace the background with plain white.
48
+ This is not needed for a background image or an injected frame at a specific position.
49
+
50
+ It is recommended to describe injected objects/people explicitly in your text prompt so that Vace can connect the Reference Images to the newly generated video; this will increase the chance that your injected people or objects actually appear.
51
+
52
+
53
+ ### Understanding Vace Control Video and Mask format
54
+ As stated above, WanGP will adapt the Control Video and the Video Mask to meet your instructions. You can preview the first frames of the new Control Video and of the Video Mask in the Generation Preview box (just click a thumbnail) to check that your request has been properly interpreted. You can also ask WanGP to save the full generated Control Video and Video Mask in the main folder of WanGP by launching the app with the *--save-masks* switch.
55
+
56
+ Look at the background colors of both the Control Video and the Video Mask:
57
+ The Video Mask is the most important one because, depending on the color of its pixels, the Control Video will be interpreted differently. If an area in the Mask is black, the corresponding Control Video area will be kept as is. On the contrary, if an area of the Mask is plain white, a Vace process will be applied to this area. If there isn't any Video Mask, the Vace process will be applied to the whole video frames. The nature of the process itself will depend on what there is in the Control Video for this area:
58
+ - if the area is grey (127) in the Control Video, this area will be replaced by new content based on the text prompt or image references
59
+ - if an area represents a person in the wireframe Open Pose format, it will be replaced by a person animated with the motion described by the Open Pose. The appearance of the person will depend on the text prompt or image references
60
+ - if an area contains multiple shades of grey, these will be assumed to represent different levels of image depth and Vace will try to generate new content located at the same depth
61
+
62
+ There are more Vace representations. For all the different mappings please refer to the official Vace documentation.
63
+
64
+ ### Other Processing
65
+ Most of the processes below and the ones related to the Control Video can be combined together.
66
+ - **Temporal Outpainting**\
67
+ Temporal Outpainting requires an existing *Source Video* or *Control Video* and it amounts to adding missing frames. It is implicit if you use a Source Video that you want to continue (new frames will be added at the end of this Video) or if you provide a Control Video that contains fewer frames than the number that you have requested to generate.
68
+
69
+ - **Temporal Inpainting**\
70
+ With temporal inpainting you are asking Vace to generate missing frames that should exist between existing frames. There are two ways to do that:
71
+ - *Injected Reference Images* : Each Image is injected at a position of your choice and Vace will fill the gaps between these frames
72
+ - *Frames to keep in Control Video* : If using a Control Video, you can ask WanGP to hide some of these frames to let Vace generate "alternate frames" for these parts of the Control Video.
73
+
74
+ - **Spatial Outpainting**\
75
+ This feature creates new content to the top, bottom, left or right of existing frames of a Control Video. You can set the amount of content for each direction by specifying a percentage of extra content in relation to the existing frame. Please note that the resulting video will target the resolution you specified. So if this Resolution corresponds to that of your Control Video you may lose details. Therefore it may be relevant to pick a higher resolution with Spatial Outpainting.\
76
+ There are two ways to do Spatial Outpainting:
77
+ - *Injected Reference Frames* : new content will be added around Injected Frames
78
+ - *Control Video* : new content will be added on all the frames of the whole Control Video
79
+
80
+
81
+ ### Example 1 : Replace a Person in one video by another one by keeping the Background
82
+ 1) In Vace, select *Control Video Process*=**Transfer human pose**, *Area processed*=**Masked area**
83
+ 2) In *Matanyone Video Mask Creator*, load your source video and create a mask that targets a specific person
84
+ 3) Click *Export to Control Video Input and Video Mask Input* to transfer both the original video that now becomes the *Control Video* and the black & white mask that now defines the *Video Mask Area*
85
+ 4) Back in Vace, in *Reference Image* select **Inject Landscapes / People / Objects** and upload one or several pictures of the new person
86
+ 5) Generate
87
+
88
+ This also works with several people at the same time (you just need to mask several people in *Matanyone*). You can also play with the *Expand / Shrink Mask* slider if the new person is larger than the original one, and of course you can also use the text *Prompt* if you don't want to use an image for the swap.
89
+
90
+
91
+ ### Example 2 : Change the Background behind some characters
92
+ 1) In Vace, select *Control Video Process*=**Inpainting**, *Area processed*=**Non Masked area**
93
+ 2) In *Matanyone Video Mask Creator*, load your source video and create a mask that targets the people you want to keep
94
+ 3) Click *Export to Control Video Input and Video Mask Input* to transfer both the original video that now becomes the *Control Video* and the black & white mask that now defines the *Video Mask Area*
95
+ 4) Generate
96
+
97
+ If instead you select *Control Video Process*=**Depth**, the background will still be different but it will have a geometry similar to that of the control video
98
+
99
+ ### Example 3 : Outpaint a Video to the Left and Inject a Character in this new area
100
+ 1) In Vace, select *Control Video Process*=**Keep Unchanged**
101
+ 2) In *Control Video Outpainting in Percentage*, enter the value 40 in the *Left* entry
102
+ 3) In *Reference Image* select **Inject Landscapes / People / Objects** and upload one or several pictures of a person
103
+ 4) Enter a *Prompt* such as "a person is coming from the left" (of course you will need a more accurate description)
104
+ 5) Generate
105
+
106
+
107
+
108
+ ### Creating Face / Object Replacement Masks
109
+ Matanyone is a tool that will generate the Video Mask that needs to be combined with the Control Video. It is very useful as you just need to indicate in the first frame the area you want to mask and it will compute masked areas for the following frames by taking into account the motion.
110
+ 1. Load your video in Matanyone
111
+ 2. Click on the face or object in the first frame
112
+ 3. Validate the mask by clicking **Set Mask**
113
+ 4. Generate a copy of the control video (for easy transfers) and a new mask video by clicking "Generate Video Matting"
114
+ 5. Export to VACE with *Export to Control Video Input and Video Mask Input*
115
+
116
+ ### Advanced Matanyone Tips
117
+ - **Negative Point Prompts**: Remove parts from current selection if the mask goes beyond the desired area
118
+ - **Sub Masks**: Create multiple independent masks, then combine them. This may be useful if you are struggling to select exactly what you want.
119
+
120
+
121
+
122
+ ## Window Sliding for Long Videos
123
+ Generate videos up to 1 minute by merging multiple windows:
124
+ The longer the video, the greater the quality degradation. However, the effect will be less visible if your generated video mostly reuses unaltered parts of the control video.
125
+
126
+ When this feature is enabled it is important to keep in mind that every positional argument of Vace (frame positions of *Injected Reference Frames*, *Frames to keep in Control Video*) is relative to the first frame of the first Window. This is convenient as changing the size of a sliding window won't have any impact, and this allows you to define in advance the injected frames for all the windows.
127
+
128
+ Likewise, if you use *Continue Video File* by providing a *Source Video*, this Source Video will be considered as the first window and the positional arguments will be calculated in relation to the first frame of this Source Video. Also, the *overlap window size* parameter will correspond to the number of frames of the Source Video that are temporally outpainted to produce new content.
129
+
130
+ ### How It Works
131
+ - Each window uses the corresponding time segment of the Control Video
132
+ - Example: 0-4s control video → first window, 4-8s → second window, etc.
133
+ - Automatic overlap management ensures smooth transitions
134
+
135
+
136
+ ### Formula
137
+ This formula gives the number of Generated Frames for a specific number of Sliding Windows:
138
+ ```
139
+ Generated Frames = [Nb Windows - 1] × [Window Size - Overlap - Discard] + Window Size
140
+ ```
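+
+ For instance, with assumed values of 3 windows, a window size of 81 frames, an overlap of 16 frames and 4 discarded frames:
+ ```
+ Generated Frames = [3 - 1] × [81 - 16 - 4] + 81 = 2 × 61 + 81 = 203
+ ```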
141
+
142
+ ### Multi-Line Prompts (Experimental)
143
+ If you enable *Text Prompts separated by a Carriage Return will be used for a new Sliding Window*, you can define in advance a different prompt for each window (see the example after this list):
144
+ - Each prompt is separated by a Carriage Return
145
+ - Each line of prompt will be used for a different window
146
+ - If more windows than prompt lines, last line repeats
147
+
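+ For example, a hypothetical prompt for two windows could look like this (the first line drives the first window, the second line the second window):
+ ```
+ A woman walks through a misty forest
+ The same woman reaches a clearing and looks up at the sky
+ ```
+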
148
+ ## Recommended Settings
149
+
150
+ ### Quality Settings
151
+ - **Skip Layer Guidance**: Turn ON with default configuration for better results (useless with FusioniX or Causvid as there is no cfg)
152
+ - **Long Prompts**: Use detailed descriptions, especially for background elements not in reference images
153
+ - **Steps**: Use at least 15 steps for good quality, and 30+ for best results, if you use the original Vace model. Only 8-10 steps are needed with Vace FusioniX or if you use Loras such as CausVid or Self-Forcing.
154
+
155
+ ### Sliding Window Settings
156
+ For very long videos, configure sliding windows properly:
157
+
158
+ - **Window Size**: Set appropriate duration for your content
159
+ - **Overlap Frames**: Long enough for motion continuity, short enough to avoid blur propagation
160
+ - **Discard Last Frames**: Remove at least 4 frames from each window (VACE 1.3B tends to blur final frames)
161
+ - **Add Overlapped Noise**: May or may not reduce quality degradation over time
162
+
163
+ ### Background Removal
164
+ WanGP includes automatic background removal options:
165
+ - Use for reference images containing people/objects
166
+ - **Don't use** this for landscape/setting reference images (the first reference image)
167
+ - If you are not happy with the automatic background removal, you can use the Image version of Matanyone for precise background removal
168
+
169
+ ## External Resources
170
+
171
+ ### Official VACE Resources
172
+ - **GitHub**: https://github.com/ali-vilab/VACE/tree/main/vace/gradios
173
+ - **User Guide**: https://github.com/ali-vilab/VACE/blob/main/UserGuide.md
174
+ - **Preprocessors**: Gradio tools for preparing materials
175
+
176
+ ### Recommended External Tools
177
+ - **Annotation Tools**: For creating precise masks
178
+ - **Video Editors**: For preparing control videos
179
+ - **Background Removal**: For cleaning reference images
180
+
181
+ ## Troubleshooting
182
+
183
+ ### Poor Quality Results
184
+ 1. Use longer, more detailed prompts
185
+ 2. Enable Skip Layer Guidance
186
+ 3. Increase number of steps (30+)
187
+ 4. Check reference image quality
188
+ 5. Ensure proper mask creation
189
+
190
+ ### Inconsistent Windows
191
+ 1. Increase overlap frames
192
+ 2. Use consistent prompting across windows
193
+ 3. Add noise to overlapped frames
194
+ 4. Reduce discard frames if losing too much content
195
+
196
+ ### Memory Issues
197
+ 1. Use VACE 1.3B instead of 14B
198
+ 2. Reduce video length or resolution
199
+ 3. Decrease window size
200
+ 4. Enable quantization
201
+
202
+ ### Blurry Results
203
+ 1. Reduce overlap frames
204
+ 2. Increase discard last frames
205
+ 3. Use higher resolution reference images
206
+ 4. Check control video quality
207
+
208
+ ## Tips for Best Results
209
+ 1. **Detailed Prompts**: Describe everything in the scene, especially elements not in reference images
210
+ 2. **Quality Reference Images**: Use high-resolution, well-lit reference images
211
+ 3. **Proper Masking**: Take time to create precise masks with Matanyone
212
+ 4. **Iterative Approach**: Start with short videos, then extend successful results
213
+ 5. **Background Preparation**: Remove complex backgrounds from object/person reference images
214
+ 6. **Consistent Lighting**: Match lighting between reference images and intended scene
fantasytalking/infer.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright Alibaba Inc. All Rights Reserved.
2
+
3
+ from transformers import Wav2Vec2Model, Wav2Vec2Processor
4
+
5
+ from .model import FantasyTalkingAudioConditionModel
6
+ from .utils import get_audio_features
7
+ import gc, torch
8
+
9
+ def parse_audio(audio_path, num_frames, fps = 23, device = "cuda"):
10
+ fantasytalking = FantasyTalkingAudioConditionModel(None, 768, 2048).to(device)
11
+ from mmgp import offload
12
+ from accelerate import init_empty_weights
13
+ from fantasytalking.model import AudioProjModel
14
+
15
+ torch.set_grad_enabled(False)
16
+
17
+ with init_empty_weights():
18
+ proj_model = AudioProjModel( 768, 2048)
19
+ offload.load_model_data(proj_model, "ckpts/fantasy_proj_model.safetensors")
20
+ proj_model.to("cpu").eval().requires_grad_(False)
21
+
22
+ wav2vec_model_dir = "ckpts/wav2vec"
23
+ wav2vec_processor = Wav2Vec2Processor.from_pretrained(wav2vec_model_dir)
24
+ wav2vec = Wav2Vec2Model.from_pretrained(wav2vec_model_dir, device_map="cpu").eval().requires_grad_(False)
25
+ wav2vec.to(device)
26
+ proj_model.to(device)
27
+ audio_wav2vec_fea = get_audio_features( wav2vec, wav2vec_processor, audio_path, fps, num_frames )
28
+
29
+ audio_proj_fea = proj_model(audio_wav2vec_fea)
30
+ pos_idx_ranges = fantasytalking.split_audio_sequence( audio_proj_fea.size(1), num_frames=num_frames )
31
+ audio_proj_split, audio_context_lens = fantasytalking.split_tensor_with_padding( audio_proj_fea, pos_idx_ranges, expand_length=4 ) # [b,21,9+8,768]
32
+ wav2vec, proj_model= None, None
33
+ gc.collect()
34
+ torch.cuda.empty_cache()
35
+
36
+ return audio_proj_split, audio_context_lens
fantasytalking/model.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from wan.modules.attention import pay_attention
5
+
6
+
7
+ class AudioProjModel(nn.Module):
8
+ def __init__(self, audio_in_dim=1024, cross_attention_dim=1024):
9
+ super().__init__()
10
+ self.cross_attention_dim = cross_attention_dim
11
+ self.proj = torch.nn.Linear(audio_in_dim, cross_attention_dim, bias=False)
12
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
13
+
14
+ def forward(self, audio_embeds):
15
+ context_tokens = self.proj(audio_embeds)
16
+ context_tokens = self.norm(context_tokens)
17
+ return context_tokens # [B,L,C]
18
+
19
+ class WanCrossAttentionProcessor(nn.Module):
20
+ def __init__(self, context_dim, hidden_dim):
21
+ super().__init__()
22
+
23
+ self.context_dim = context_dim
24
+ self.hidden_dim = hidden_dim
25
+
26
+ self.k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
27
+ self.v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
28
+
29
+ nn.init.zeros_(self.k_proj.weight)
30
+ nn.init.zeros_(self.v_proj.weight)
31
+
32
+ def __call__(
33
+ self,
34
+ q: torch.Tensor,
35
+ audio_proj: torch.Tensor,
36
+ latents_num_frames: int = 21,
37
+ audio_context_lens = None
38
+ ) -> torch.Tensor:
39
+ """
40
+ audio_proj: [B, 21, L3, C]
41
+ audio_context_lens: [B*21].
42
+ """
43
+ b, l, n, d = q.shape
44
+
45
+ if len(audio_proj.shape) == 4:
46
+ audio_q = q.view(b * latents_num_frames, -1, n, d) # [b, 21, l1, n, d]
47
+ ip_key = self.k_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
48
+ ip_value = self.v_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
49
+ qkv_list = [audio_q, ip_key, ip_value]
50
+ del q, audio_q, ip_key, ip_value
51
+ audio_x = pay_attention(qkv_list, k_lens =audio_context_lens) #audio_context_lens
52
+ audio_x = audio_x.view(b, l, n, d)
53
+ audio_x = audio_x.flatten(2)
54
+ elif len(audio_proj.shape) == 3:
55
+ ip_key = self.k_proj(audio_proj).view(b, -1, n, d)
56
+ ip_value = self.v_proj(audio_proj).view(b, -1, n, d)
57
+ qkv_list = [q, ip_key, ip_value]
58
+ del q, ip_key, ip_value
59
+ audio_x = pay_attention(qkv_list, k_lens =audio_context_lens) #audio_context_lens
60
+ audio_x = audio_x.flatten(2)
61
+ return audio_x
62
+
63
+
64
+ class FantasyTalkingAudioConditionModel(nn.Module):
65
+ def __init__(self, wan_dit, audio_in_dim: int, audio_proj_dim: int):
66
+ super().__init__()
67
+
68
+ self.audio_in_dim = audio_in_dim
69
+ self.audio_proj_dim = audio_proj_dim
70
+
71
+ def split_audio_sequence(self, audio_proj_length, num_frames=81):
72
+ """
73
+ Map the audio feature sequence to corresponding latent frame slices.
74
+
75
+ Args:
76
+ audio_proj_length (int): The total length of the audio feature sequence
77
+ (e.g., 173 in audio_proj[1, 173, 768]).
78
+ num_frames (int): The number of video frames in the training data (default: 81).
79
+
80
+ Returns:
81
+ list: A list of [start_idx, end_idx] pairs. Each pair represents the index range
82
+ (within the audio feature sequence) corresponding to a latent frame.
83
+ """
84
+ # Average number of tokens per original video frame
85
+ tokens_per_frame = audio_proj_length / num_frames
86
+
87
+ # Each latent frame covers 4 video frames, and we want the center
88
+ tokens_per_latent_frame = tokens_per_frame * 4
89
+ half_tokens = int(tokens_per_latent_frame / 2)
90
+
91
+ pos_indices = []
92
+ for i in range(int((num_frames - 1) / 4) + 1):
93
+ if i == 0:
94
+ pos_indices.append(0)
95
+ else:
96
+ start_token = tokens_per_frame * ((i - 1) * 4 + 1)
97
+ end_token = tokens_per_frame * (i * 4 + 1)
98
+ center_token = int((start_token + end_token) / 2) - 1
99
+ pos_indices.append(center_token)
100
+
101
+ # Build index ranges centered around each position
102
+ pos_idx_ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
103
+
104
+ # Adjust the first range to avoid negative start index
105
+ pos_idx_ranges[0] = [
106
+ -(half_tokens * 2 - pos_idx_ranges[1][0]),
107
+ pos_idx_ranges[1][0],
108
+ ]
109
+
110
+ return pos_idx_ranges
111
+
112
+ def split_tensor_with_padding(self, input_tensor, pos_idx_ranges, expand_length=0):
113
+ """
114
+ Split the input tensor into subsequences based on index ranges, and apply right-side zero-padding
115
+ if the range exceeds the input boundaries.
116
+
117
+ Args:
118
+ input_tensor (Tensor): Input audio tensor of shape [1, L, 768].
119
+ pos_idx_ranges (list): A list of index ranges, e.g. [[-7, 1], [1, 9], ..., [165, 173]].
120
+ expand_length (int): Number of tokens to expand on both sides of each subsequence.
121
+
122
+ Returns:
123
+ sub_sequences (Tensor): A tensor of shape [1, F, L, 768], where L is the length after padding.
124
+ Each element is a padded subsequence.
125
+ k_lens (Tensor): A tensor of shape [F], representing the actual (unpadded) length of each subsequence.
126
+ Useful for ignoring padding tokens in attention masks.
127
+ """
128
+ pos_idx_ranges = [
129
+ [idx[0] - expand_length, idx[1] + expand_length] for idx in pos_idx_ranges
130
+ ]
131
+ sub_sequences = []
132
+ seq_len = input_tensor.size(1) # 173
133
+ max_valid_idx = seq_len - 1 # 172
134
+ k_lens_list = []
135
+ for start, end in pos_idx_ranges:
136
+ # Calculate the fill amount
137
+ pad_front = max(-start, 0)
138
+ pad_back = max(end - max_valid_idx, 0)
139
+
140
+ # Calculate the start and end indices of the valid part
141
+ valid_start = max(start, 0)
142
+ valid_end = min(end, max_valid_idx)
143
+
144
+ # Extract the valid part
145
+ if valid_start <= valid_end:
146
+ valid_part = input_tensor[:, valid_start : valid_end + 1, :]
147
+ else:
148
+ valid_part = input_tensor.new_zeros((1, 0, input_tensor.size(2)))
149
+
150
+ # In the sequence dimension (the 1st dimension) perform padding
151
+ padded_subseq = F.pad(
152
+ valid_part,
153
+ (0, 0, 0, pad_back + pad_front, 0, 0),
154
+ mode="constant",
155
+ value=0,
156
+ )
157
+ k_lens_list.append(padded_subseq.size(-2) - pad_back - pad_front)
158
+
159
+ sub_sequences.append(padded_subseq)
160
+ return torch.stack(sub_sequences, dim=1), torch.tensor(
161
+ k_lens_list, dtype=torch.long
162
+ )
fantasytalking/utils.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright Alibaba Inc. All Rights Reserved.
2
+
3
+ import imageio
4
+ import librosa
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+ from tqdm import tqdm
9
+
10
+
11
+ def resize_image_by_longest_edge(image_path, target_size):
12
+ image = Image.open(image_path).convert("RGB")
13
+ width, height = image.size
14
+ scale = target_size / max(width, height)
15
+ new_size = (int(width * scale), int(height * scale))
16
+ return image.resize(new_size, Image.LANCZOS)
17
+
18
+
19
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
20
+ writer = imageio.get_writer(
21
+ save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
22
+ )
23
+ for frame in tqdm(frames, desc="Saving video"):
24
+ frame = np.array(frame)
25
+ writer.append_data(frame)
26
+ writer.close()
27
+
28
+
29
+ def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
30
+ sr = 16000
31
+ audio_input, sample_rate = librosa.load(audio_path, sr=sr) # resample to a 16 kHz sampling rate
32
+
33
+ start_time = 0
34
+ # end_time = (0 + (num_frames - 1) * 1) / fps
35
+ end_time = num_frames / fps
36
+
37
+ start_sample = int(start_time * sr)
38
+ end_sample = int(end_time * sr)
39
+
40
+ try:
41
+ audio_segment = audio_input[start_sample:end_sample]
42
+ except Exception:
43
+ audio_segment = audio_input
44
+
45
+ input_values = audio_processor(
46
+ audio_segment, sampling_rate=sample_rate, return_tensors="pt"
47
+ ).input_values.to("cuda")
48
+
49
+ with torch.no_grad():
50
+ fea = wav2vec(input_values).last_hidden_state
51
+
52
+ return fea
finetunes/hunyuan_t2v_accvideo.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "name": "Hunyuan AccVideo 720p 13B",
4
+ "architecture": "hunyuan",
5
+ "description": " AccVideo is a novel efficient distillation method to accelerate video diffusion models with synthetic datset. Our method is 8.5x faster than HunyuanVideo.",
6
+ "URLs": [
7
+ "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/accvideo_hunyuan_video_720_quanto_int8.safetensors"
8
+ ],
9
+ "preload_URLs": [
10
+ ],
11
+ "auto_quantize": true
12
+ },
13
+ "negative_prompt": "",
14
+ "resolution": "832x480",
15
+ "video_length": 81,
16
+ "seed": 42,
17
+ "num_inference_steps": 5,
18
+ "flow_shift": 7,
19
+ "embedded_guidance_scale": 6,
20
+ "repeat_generation": 1,
21
+ "loras_multipliers": "",
22
+ "temporal_upsampling": "",
23
+ "spatial_upsampling": "",
24
+ "RIFLEx_setting": 0,
25
+ "slg_start_perc": 10,
26
+ "slg_end_perc": 90,
27
+ "prompt_enhancer": "",
28
+ "activated_loras": [
29
+ ]
30
+ }
finetunes/hunyuan_t2v_fast.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "name": "Hunyuan Fast Video 720p 13B",
4
+ "architecture": "hunyuan",
5
+ "description": "Fast Hunyuan is an accelerated HunyuanVideo model. It can sample high quality videos with 6 diffusion steps.",
6
+ "URLs": [
7
+ "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/fast_hunyuan_video_720_quanto_int8.safetensors"
8
+ ],
9
+ "preload_URLs": [
10
+ "https://huggingface.co/DeepBeepMeep/HunyuanVideo/resolve/main/fast_hunyuan_video_720_quanto_int8_map.json"
11
+ ],
12
+ "auto_quantize": true
13
+ },
14
+ "negative_prompt": "",
15
+ "resolution": "832x480",
16
+ "video_length": 81,
17
+ "seed": 42,
18
+ "num_inference_steps": 6,
19
+ "flow_shift": 17,
20
+ "embedded_guidance_scale": 6,
21
+ "repeat_generation": 1,
22
+ "loras_multipliers": "",
23
+ "temporal_upsampling": "",
24
+ "spatial_upsampling": "",
25
+ "RIFLEx_setting": 0,
26
+ "slg_start_perc": 10,
27
+ "slg_end_perc": 90,
28
+ "prompt_enhancer": "",
29
+ "activated_loras": [
30
+ ]
31
+ }
finetunes/t2v_fusionix.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model":
3
+ {
4
+ "name": "Wan2.1 text2video FusioniX 14B",
5
+ "architecture" : "t2v",
6
+ "description": "A powerful merged text-to-video model based on the original WAN 2.1 T2V model, enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
7
+ "URLs": [
8
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors",
9
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors",
10
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors"
11
+ ],
12
+ "auto_quantize": true
13
+ },
14
+ "negative_prompt": "",
15
+ "prompt": "",
16
+ "resolution": "832x480",
17
+ "video_length": 81,
18
+ "seed": -1,
19
+ "num_inference_steps": 8,
20
+ "guidance_scale": 1,
21
+ "flow_shift": 5,
22
+ "embedded_guidance_scale": 6,
23
+ "repeat_generation": 1,
24
+ "multi_images_gen_type": 0,
25
+ "tea_cache_setting": 0,
26
+ "tea_cache_start_step_perc": 0,
27
+ "loras_multipliers": "",
28
+ "temporal_upsampling": "",
29
+ "spatial_upsampling": "",
30
+ "RIFLEx_setting": 0,
31
+ "slg_switch": 0,
32
+ "slg_start_perc": 10,
33
+ "slg_end_perc": 90,
34
+ "cfg_star_switch": 0,
35
+ "cfg_zero_step": -1,
36
+ "prompt_enhancer": "",
37
+ "activated_loras": []
38
+ }
finetunes/t2v_sf.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "name": "Wan2.1 text2video Self-Forcing 14B",
4
+ "architecture": "t2v",
5
+ "description": "This model is an advanced text-to-video generation model. This approach allows the model to generate videos with significantly fewer inference steps (4 or 8 steps) and without classifier-free guidance, substantially reducing video generation time while maintaining high quality outputs.",
6
+ "URLs": [
7
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_bf16.safetensors",
8
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_bf16_int8.safetensors",
9
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_fp16_int8.safetensors"
10
+ ],
11
+ "author": "https://huggingface.co/lightx2v/Wan2.1-T2V-14B-StepDistill-CfgDistill",
12
+ "auto_quantize": true
13
+ },
14
+ "negative_prompt": "",
15
+ "prompt": "",
16
+ "resolution": "832x480",
17
+ "video_length": 81,
18
+ "seed": -1,
19
+ "num_inference_steps": 4,
20
+ "guidance_scale": 1,
21
+ "flow_shift": 3,
22
+ "embedded_guidance_scale": 6,
23
+ "repeat_generation": 1,
24
+ "multi_images_gen_type": 0,
25
+ "tea_cache_setting": 0,
26
+ "tea_cache_start_step_perc": 0,
27
+ "loras_multipliers": "",
28
+ "temporal_upsampling": "",
29
+ "spatial_upsampling": "",
30
+ "RIFLEx_setting": 0,
31
+ "slg_switch": 0,
32
+ "slg_start_perc": 10,
33
+ "slg_end_perc": 90,
34
+ "cfg_star_switch": 0,
35
+ "cfg_zero_step": -1,
36
+ "prompt_enhancer": "",
37
+ "activated_loras": []
38
+ }
finetunes/vace_14B_fusionix.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "name": "Vace FusioniX 14B",
4
+ "architecture": "vace_14B",
5
+ "modules": [
6
+ "vace_14B"
7
+ ],
8
+ "description": "Vace control model enhanced using multiple open-source components and LoRAs to boost motion realism, temporal consistency, and expressive detail.",
9
+ "URLs": [
10
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_fp16.safetensors",
11
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_bf16_int8.safetensors",
12
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/Wan14BT2VFusioniX_quanto_fp16_int8.safetensors"
13
+ ],
14
+ "auto_quantize": true
15
+ },
16
+ "negative_prompt": "",
17
+ "prompt": "",
18
+ "resolution": "832x480",
19
+ "video_length": 81,
20
+ "seed": -1,
21
+ "num_inference_steps": 10,
22
+ "guidance_scale": 1,
23
+ "flow_shift": 5,
24
+ "embedded_guidance_scale": 6,
25
+ "repeat_generation": 1,
26
+ "multi_images_gen_type": 0,
27
+ "tea_cache_setting": 0,
28
+ "tea_cache_start_step_perc": 0,
29
+ "loras_multipliers": "",
30
+ "temporal_upsampling": "",
31
+ "spatial_upsampling": "",
32
+ "RIFLEx_setting": 0,
33
+ "slg_switch": 0,
34
+ "slg_start_perc": 10,
35
+ "slg_end_perc": 90,
36
+ "cfg_star_switch": 0,
37
+ "cfg_zero_step": -1,
38
+ "prompt_enhancer": "",
39
+ "activated_loras": []
40
+ }
finetunes/vace_14B_sf.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "name": "Vace Self-Forcing 14B",
4
+ "architecture": "vace_14B",
5
+ "modules": [
6
+ "vace_14B"
7
+ ],
8
+ "description": "This model is a combination of Vace and an advanced text-to-video generation model. This approach allows the model to generate videos with significantly fewer inference steps (4 or 8 steps) and without classifier-free guidance, substantially reducing video generation time while maintaining high quality outputs.",
9
+ "URLs": [
10
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_bf16.safetensors",
11
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_bf16_int8.safetensors",
12
+ "https://huggingface.co/DeepBeepMeep/Wan2.1/resolve/main/wan2.1_StepDistill-CfgDistill_14B_quanto_fp16_int8.safetensors"
13
+ ],
14
+ "author": "https://huggingface.co/lightx2v/Wan2.1-T2V-14B-StepDistill-CfgDistill",
15
+ "auto_quantize": true
16
+ },
17
+ "negative_prompt": "",
18
+ "prompt": "",
19
+ "resolution": "832x480",
20
+ "video_length": 81,
21
+ "seed": -1,
22
+ "num_inference_steps": 4,
23
+ "guidance_scale": 1,
24
+ "flow_shift": 3,
25
+ "embedded_guidance_scale": 6,
26
+ "repeat_generation": 1,
27
+ "multi_images_gen_type": 0,
28
+ "tea_cache_setting": 0,
29
+ "tea_cache_start_step_perc": 0,
30
+ "loras_multipliers": "",
31
+ "temporal_upsampling": "",
32
+ "spatial_upsampling": "",
33
+ "RIFLEx_setting": 0,
34
+ "slg_switch": 0,
35
+ "slg_start_perc": 10,
36
+ "slg_end_perc": 90,
37
+ "cfg_star_switch": 0,
38
+ "cfg_zero_step": -1,
39
+ "prompt_enhancer": "",
40
+ "activated_loras": []
41
+ }
hyvideo/__init__.py ADDED
File without changes
hyvideo/config.py ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from .constants import *
3
+ import re
4
+ from .modules.models import HUNYUAN_VIDEO_CONFIG
5
+
6
+
7
+ def parse_args(namespace=None):
8
+ parser = argparse.ArgumentParser(description="HunyuanVideo inference script")
9
+
10
+ parser = add_network_args(parser)
11
+ parser = add_extra_models_args(parser)
12
+ parser = add_denoise_schedule_args(parser)
13
+ parser = add_inference_args(parser)
14
+ parser = add_parallel_args(parser)
15
+
16
+ args = parser.parse_args(namespace=namespace)
17
+ args = sanity_check_args(args)
18
+
19
+ return args
20
+
21
+
22
+ def add_network_args(parser: argparse.ArgumentParser):
23
+ group = parser.add_argument_group(title="HunyuanVideo network args")
24
+
25
+
26
+ group.add_argument(
27
+ "--quantize-transformer",
28
+ action="store_true",
29
+ help="On the fly 'transformer' quantization"
30
+ )
31
+
32
+
33
+ group.add_argument(
34
+ "--lora-dir-i2v",
35
+ type=str,
36
+ default="loras_i2v",
37
+ help="Path to a directory that contains Loras for i2v"
38
+ )
39
+
40
+
41
+ group.add_argument(
42
+ "--lora-dir",
43
+ type=str,
44
+ default="",
45
+ help="Path to a directory that contains Loras"
46
+ )
47
+
48
+
49
+ group.add_argument(
50
+ "--lora-preset",
51
+ type=str,
52
+ default="",
53
+ help="Lora preset to preload"
54
+ )
55
+
56
+ # group.add_argument(
57
+ # "--lora-preset-i2v",
58
+ # type=str,
59
+ # default="",
60
+ # help="Lora preset to preload for i2v"
61
+ # )
62
+
63
+ group.add_argument(
64
+ "--profile",
65
+ type=str,
66
+ default=-1,
67
+ help="Profile No"
68
+ )
69
+
70
+ group.add_argument(
71
+ "--verbose",
72
+ type=str,
73
+ default=1,
74
+ help="Verbose level"
75
+ )
76
+
77
+ group.add_argument(
78
+ "--server-port",
79
+ type=str,
80
+ default=0,
81
+ help="Server port"
82
+ )
83
+
84
+ group.add_argument(
85
+ "--server-name",
86
+ type=str,
87
+ default="",
88
+ help="Server name"
89
+ )
90
+
91
+ group.add_argument(
92
+ "--open-browser",
93
+ action="store_true",
94
+ help="open browser"
95
+ )
96
+
97
+ group.add_argument(
98
+ "--t2v",
99
+ action="store_true",
100
+ help="text to video mode"
101
+ )
102
+
103
+ group.add_argument(
104
+ "--i2v",
105
+ action="store_true",
106
+ help="image to video mode"
107
+ )
108
+
109
+ group.add_argument(
110
+ "--compile",
111
+ action="store_true",
112
+ help="Enable pytorch compilation"
113
+ )
114
+
115
+ group.add_argument(
116
+ "--fast",
117
+ action="store_true",
118
+ help="use Fast HunyuanVideo model"
119
+ )
120
+
121
+ group.add_argument(
122
+ "--fastest",
123
+ action="store_true",
124
+ help="activate the best config"
125
+ )
126
+
127
+ group.add_argument(
128
+ "--attention",
129
+ type=str,
130
+ default="",
131
+ help="attention mode"
132
+ )
133
+
134
+ group.add_argument(
135
+ "--vae-config",
136
+ type=str,
137
+ default="",
138
+ help="vae config mode"
139
+ )
140
+
141
+ parser.add_argument(
142
+ "--share",
143
+ action="store_true",
144
+ help="Create a shared URL to access webserver remotely"
145
+ )
146
+
147
+ parser.add_argument(
148
+ "--lock-config",
149
+ action="store_true",
150
+ help="Prevent modifying the configuration from the web interface"
151
+ )
152
+
153
+ parser.add_argument(
154
+ "--preload",
155
+ type=str,
156
+ default="0",
157
+ help="Megabytes of the diffusion model to preload in VRAM"
158
+ )
159
+
160
+ parser.add_argument(
161
+ "--multiple-images",
162
+ action="store_true",
163
+ help="Allow inputting multiple images with image to video"
164
+ )
165
+
166
+
167
+ # Main model
168
+ group.add_argument(
169
+ "--model",
170
+ type=str,
171
+ choices=list(HUNYUAN_VIDEO_CONFIG.keys()),
172
+ default="HYVideo-T/2-cfgdistill",
173
+ )
174
+ group.add_argument(
175
+ "--latent-channels",
176
+ type=int,
177
+ default=16,
178
+ help="Number of latent channels of DiT. If None, it will be determined by `vae`. If provided, "
179
+ "it still needs to match the latent channels of the VAE model.",
180
+ )
181
+ group.add_argument(
182
+ "--precision",
183
+ type=str,
184
+ default="bf16",
185
+ choices=PRECISIONS,
186
+ help="Precision mode. Options: fp32, fp16, bf16. Applied to the backbone model and optimizer.",
187
+ )
188
+
189
+ # RoPE
190
+ group.add_argument(
191
+ "--rope-theta", type=int, default=256, help="Theta used in RoPE."
192
+ )
193
+ return parser
194
+
195
+
196
+ def add_extra_models_args(parser: argparse.ArgumentParser):
197
+ group = parser.add_argument_group(
198
+ title="Extra models args, including vae, text encoders and tokenizers)"
199
+ )
200
+
201
+ # - VAE
202
+ group.add_argument(
203
+ "--vae",
204
+ type=str,
205
+ default="884-16c-hy",
206
+ choices=list(VAE_PATH),
207
+ help="Name of the VAE model.",
208
+ )
209
+ group.add_argument(
210
+ "--vae-precision",
211
+ type=str,
212
+ default="fp16",
213
+ choices=PRECISIONS,
214
+ help="Precision mode for the VAE model.",
215
+ )
216
+ group.add_argument(
217
+ "--vae-tiling",
218
+ action="store_true",
219
+ help="Enable tiling for the VAE model to save GPU memory.",
220
+ )
221
+ group.set_defaults(vae_tiling=True)
222
+
223
+ group.add_argument(
224
+ "--text-encoder",
225
+ type=str,
226
+ default="llm",
227
+ choices=list(TEXT_ENCODER_PATH),
228
+ help="Name of the text encoder model.",
229
+ )
230
+ group.add_argument(
231
+ "--text-encoder-precision",
232
+ type=str,
233
+ default="fp16",
234
+ choices=PRECISIONS,
235
+ help="Precision mode for the text encoder model.",
236
+ )
237
+ group.add_argument(
238
+ "--text-states-dim",
239
+ type=int,
240
+ default=4096,
241
+ help="Dimension of the text encoder hidden states.",
242
+ )
243
+ group.add_argument(
244
+ "--text-len", type=int, default=256, help="Maximum length of the text input."
245
+ )
246
+ group.add_argument(
247
+ "--tokenizer",
248
+ type=str,
249
+ default="llm",
250
+ choices=list(TOKENIZER_PATH),
251
+ help="Name of the tokenizer model.",
252
+ )
253
+ group.add_argument(
254
+ "--prompt-template",
255
+ type=str,
256
+ default="dit-llm-encode",
257
+ choices=PROMPT_TEMPLATE,
258
+ help="Image prompt template for the decoder-only text encoder model.",
259
+ )
260
+ group.add_argument(
261
+ "--prompt-template-video",
262
+ type=str,
263
+ default="dit-llm-encode-video",
264
+ choices=PROMPT_TEMPLATE,
265
+ help="Video prompt template for the decoder-only text encoder model.",
266
+ )
267
+ group.add_argument(
268
+ "--hidden-state-skip-layer",
269
+ type=int,
270
+ default=2,
271
+ help="Skip layer for hidden states.",
272
+ )
273
+ group.add_argument(
274
+ "--apply-final-norm",
275
+ action="store_true",
276
+ help="Apply final normalization to the used text encoder hidden states.",
277
+ )
278
+
279
+ # - CLIP
280
+ group.add_argument(
281
+ "--text-encoder-2",
282
+ type=str,
283
+ default="clipL",
284
+ choices=list(TEXT_ENCODER_PATH),
285
+ help="Name of the second text encoder model.",
286
+ )
287
+ group.add_argument(
288
+ "--text-encoder-precision-2",
289
+ type=str,
290
+ default="fp16",
291
+ choices=PRECISIONS,
292
+ help="Precision mode for the second text encoder model.",
293
+ )
294
+ group.add_argument(
295
+ "--text-states-dim-2",
296
+ type=int,
297
+ default=768,
298
+ help="Dimension of the second text encoder hidden states.",
299
+ )
300
+ group.add_argument(
301
+ "--tokenizer-2",
302
+ type=str,
303
+ default="clipL",
304
+ choices=list(TOKENIZER_PATH),
305
+ help="Name of the second tokenizer model.",
306
+ )
307
+ group.add_argument(
308
+ "--text-len-2",
309
+ type=int,
310
+ default=77,
311
+ help="Maximum length of the second text input.",
312
+ )
313
+
314
+ return parser
315
+
316
+
317
+ def add_denoise_schedule_args(parser: argparse.ArgumentParser):
318
+ group = parser.add_argument_group(title="Denoise schedule args")
319
+
320
+ group.add_argument(
321
+ "--denoise-type",
322
+ type=str,
323
+ default="flow",
324
+ help="Denoise type for noised inputs.",
325
+ )
326
+
327
+ # Flow Matching
328
+ group.add_argument(
329
+ "--flow-shift",
330
+ type=float,
331
+ default=7.0,
332
+ help="Shift factor for flow matching schedulers.",
333
+ )
334
+ group.add_argument(
335
+ "--flow-reverse",
336
+ action="store_true",
337
+ help="If reverse, learning/sampling from t=1 -> t=0.",
338
+ )
339
+ group.add_argument(
340
+ "--flow-solver",
341
+ type=str,
342
+ default="euler",
343
+ help="Solver for flow matching.",
344
+ )
345
+ group.add_argument(
346
+ "--use-linear-quadratic-schedule",
347
+ action="store_true",
348
+ help="Use linear quadratic schedule for flow matching."
349
+ "Following MovieGen (https://ai.meta.com/static-resource/movie-gen-research-paper)",
350
+ )
351
+ group.add_argument(
352
+ "--linear-schedule-end",
353
+ type=int,
354
+ default=25,
355
+ help="End step for linear quadratic schedule for flow matching.",
356
+ )
357
+
358
+ return parser
359
+
360
+
361
+ def add_inference_args(parser: argparse.ArgumentParser):
362
+ group = parser.add_argument_group(title="Inference args")
363
+
364
+ # ======================== Model loads ========================
365
+ group.add_argument(
366
+ "--model-base",
367
+ type=str,
368
+ default="ckpts",
369
+ help="Root path of all the models, including t2v models and extra models.",
370
+ )
371
+ group.add_argument(
372
+ "--dit-weight",
373
+ type=str,
374
+ default="ckpts/hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt",
375
+ help="Path to the HunyuanVideo model. If None, search the model in the args.model_root."
376
+ "1. If it is a file, load the model directly."
377
+ "2. If it is a directory, search the model in the directory. Support two types of models: "
378
+ "1) named `pytorch_model_*.pt`"
379
+ "2) named `*_model_states.pt`, where * can be `mp_rank_00`.",
380
+ )
381
+ group.add_argument(
382
+ "--model-resolution",
383
+ type=str,
384
+ default="540p",
385
+ choices=["540p", "720p"],
386
+ help="Root path of all the models, including t2v models and extra models.",
387
+ )
388
+ group.add_argument(
389
+ "--load-key",
390
+ type=str,
391
+ default="module",
392
+ help="Key to load the model states. 'module' for the main model, 'ema' for the EMA model.",
393
+ )
394
+ group.add_argument(
395
+ "--use-cpu-offload",
396
+ action="store_true",
397
+ help="Use CPU offload for the model load.",
398
+ )
399
+
400
+ # ======================== Inference general setting ========================
401
+ group.add_argument(
402
+ "--batch-size",
403
+ type=int,
404
+ default=1,
405
+ help="Batch size for inference and evaluation.",
406
+ )
407
+ group.add_argument(
408
+ "--infer-steps",
409
+ type=int,
410
+ default=50,
411
+ help="Number of denoising steps for inference.",
412
+ )
413
+ group.add_argument(
414
+ "--disable-autocast",
415
+ action="store_true",
416
+ help="Disable autocast for denoising loop and vae decoding in pipeline sampling.",
417
+ )
418
+ group.add_argument(
419
+ "--save-path",
420
+ type=str,
421
+ default="./results",
422
+ help="Path to save the generated samples.",
423
+ )
424
+ group.add_argument(
425
+ "--save-path-suffix",
426
+ type=str,
427
+ default="",
428
+ help="Suffix for the directory of saved samples.",
429
+ )
430
+ group.add_argument(
431
+ "--name-suffix",
432
+ type=str,
433
+ default="",
434
+ help="Suffix for the names of saved samples.",
435
+ )
436
+ group.add_argument(
437
+ "--num-videos",
438
+ type=int,
439
+ default=1,
440
+ help="Number of videos to generate for each prompt.",
441
+ )
442
+ # ---sample size---
443
+ group.add_argument(
444
+ "--video-size",
445
+ type=int,
446
+ nargs="+",
447
+ default=(720, 1280),
448
+ help="Video size for training. If a single value is provided, it will be used for both height "
449
+ "and width. If two values are provided, they will be used for height and width "
450
+ "respectively.",
451
+ )
452
+ group.add_argument(
453
+ "--video-length",
454
+ type=int,
455
+ default=129,
456
+ help="How many frames to sample from a video. if using 3d vae, the number should be 4n+1",
457
+ )
458
+ # --- prompt ---
459
+ group.add_argument(
460
+ "--prompt",
461
+ type=str,
462
+ default=None,
463
+ help="Prompt for sampling during evaluation.",
464
+ )
465
+ group.add_argument(
466
+ "--seed-type",
467
+ type=str,
468
+ default="auto",
469
+ choices=["file", "random", "fixed", "auto"],
470
+ help="Seed type for evaluation. If file, use the seed from the CSV file. If random, generate a "
471
+ "random seed. If fixed, use the fixed seed given by `--seed`. If auto, `csv` will use the "
472
+ "seed column if available, otherwise use the fixed `seed` value. `prompt` will use the "
473
+ "fixed `seed` value.",
474
+ )
475
+ group.add_argument("--seed", type=int, default=None, help="Seed for evaluation.")
476
+
477
+ # Classifier-Free Guidance
478
+ group.add_argument(
479
+ "--neg-prompt", type=str, default=None, help="Negative prompt for sampling."
480
+ )
481
+ group.add_argument(
482
+ "--cfg-scale", type=float, default=1.0, help="Classifier free guidance scale."
483
+ )
484
+ group.add_argument(
485
+ "--embedded-cfg-scale",
486
+ type=float,
487
+ default=6.0,
488
+ help="Embeded classifier free guidance scale.",
489
+ )
490
+
491
+ group.add_argument(
492
+ "--reproduce",
493
+ action="store_true",
494
+ help="Enable reproducibility by setting random seeds and deterministic algorithms.",
495
+ )
496
+
497
+ return parser
498
+
499
+
500
+ def add_parallel_args(parser: argparse.ArgumentParser):
501
+ group = parser.add_argument_group(title="Parallel args")
502
+
503
+ # ======================== Model loads ========================
504
+ group.add_argument(
505
+ "--ulysses-degree",
506
+ type=int,
507
+ default=1,
508
+ help="Ulysses degree.",
509
+ )
510
+ group.add_argument(
511
+ "--ring-degree",
512
+ type=int,
513
+ default=1,
514
+ help="Ulysses degree.",
515
+ )
516
+
517
+ return parser
518
+
519
+
520
+ def sanity_check_args(args):
521
+ # VAE channels
522
+ vae_pattern = r"\d{2,3}-\d{1,2}c-\w+"
523
+ if not re.match(vae_pattern, args.vae):
524
+ raise ValueError(
525
+ f"Invalid VAE model: {args.vae}. Must be in the format of '{vae_pattern}'."
526
+ )
527
+ vae_channels = int(args.vae.split("-")[1][:-1])
528
+ if args.latent_channels is None:
529
+ args.latent_channels = vae_channels
530
+ if vae_channels != args.latent_channels:
531
+ raise ValueError(
532
+ f"Latent channels ({args.latent_channels}) must match the VAE channels ({vae_channels})."
533
+ )
534
+ return args
hyvideo/constants.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+
4
+ __all__ = [
5
+ "C_SCALE",
6
+ "PROMPT_TEMPLATE",
7
+ "MODEL_BASE",
8
+ "PRECISIONS",
9
+ "NORMALIZATION_TYPE",
10
+ "ACTIVATION_TYPE",
11
+ "VAE_PATH",
12
+ "TEXT_ENCODER_PATH",
13
+ "TOKENIZER_PATH",
14
+ "TEXT_PROJECTION",
15
+ "DATA_TYPE",
16
+ "NEGATIVE_PROMPT",
17
+ "NEGATIVE_PROMPT_I2V",
18
+ "FLOW_PATH_TYPE",
19
+ "FLOW_PREDICT_TYPE",
20
+ "FLOW_LOSS_WEIGHT",
21
+ "FLOW_SNR_TYPE",
22
+ "FLOW_SOLVER",
23
+ ]
24
+
25
+ PRECISION_TO_TYPE = {
26
+ 'fp32': torch.float32,
27
+ 'fp16': torch.float16,
28
+ 'bf16': torch.bfloat16,
29
+ }
30
+
31
+ # =================== Constant Values =====================
32
+ # Computation scale factor, 1P = 1_000_000_000_000_000. Tensorboard will display the value in PetaFLOPS to avoid
33
+ # overflow error when tensorboard logging values.
34
+ C_SCALE = 1_000_000_000_000_000
35
+
36
+ # When using decoder-only models, we must provide a prompt template to instruct the text encoder
37
+ # on how to generate the text.
38
+ # --------------------------------------------------------------------
39
+ PROMPT_TEMPLATE_ENCODE = (
40
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the image by detailing the color, shape, size, texture, "
41
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
42
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
43
+ )
44
+ PROMPT_TEMPLATE_ENCODE_VIDEO = (
45
+ "<|start_header_id|>system<|end_header_id|>\n\nDescribe the video by detailing the following aspects: "
46
+ "1. The main content and theme of the video."
47
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
48
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
49
+ "4. background environment, light, style and atmosphere."
50
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>"
51
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
52
+ )
53
+
54
+ PROMPT_TEMPLATE_ENCODE_I2V = (
55
+ "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the image by detailing the color, shape, size, texture, "
56
+ "quantity, text, spatial relationships of the objects and background:<|eot_id|>"
57
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
58
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
59
+ )
60
+
61
+ PROMPT_TEMPLATE_ENCODE_VIDEO_I2V = (
62
+ "<|start_header_id|>system<|end_header_id|>\n\n<image>\nDescribe the video by detailing the following aspects according to the reference image: "
63
+ "1. The main content and theme of the video."
64
+ "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects."
65
+ "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects."
66
+ "4. background environment, light, style and atmosphere."
67
+ "5. camera angles, movements, and transitions used in the video:<|eot_id|>\n\n"
68
+ "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
69
+ "<|start_header_id|>assistant<|end_header_id|>\n\n"
70
+ )
71
+
72
+ NEGATIVE_PROMPT = "Aerial view, aerial view, overexposed, low quality, deformation, a poor composition, bad hands, bad teeth, bad eyes, bad limbs, distortion"
73
+ NEGATIVE_PROMPT_I2V = "deformation, a poor composition and deformed video, bad teeth, bad eyes, bad limbs"
74
+
75
+ PROMPT_TEMPLATE = {
76
+ "dit-llm-encode": {
77
+ "template": PROMPT_TEMPLATE_ENCODE,
78
+ "crop_start": 36,
79
+ },
80
+ "dit-llm-encode-video": {
81
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO,
82
+ "crop_start": 95,
83
+ },
84
+ "dit-llm-encode-i2v": {
85
+ "template": PROMPT_TEMPLATE_ENCODE_I2V,
86
+ "crop_start": 36,
87
+ "image_emb_start": 5,
88
+ "image_emb_end": 581,
89
+ "image_emb_len": 576,
90
+ "double_return_token_id": 271
91
+ },
92
+ "dit-llm-encode-video-i2v": {
93
+ "template": PROMPT_TEMPLATE_ENCODE_VIDEO_I2V,
94
+ "crop_start": 103,
95
+ "image_emb_start": 5,
96
+ "image_emb_end": 581,
97
+ "image_emb_len": 576,
98
+ "double_return_token_id": 271
99
+ },
100
+ }
101
+
102
+ # ======================= Model ======================
103
+ PRECISIONS = {"fp32", "fp16", "bf16"}
104
+ NORMALIZATION_TYPE = {"layer", "rms"}
105
+ ACTIVATION_TYPE = {"relu", "silu", "gelu", "gelu_tanh"}
106
+
107
+ # =================== Model Path =====================
108
+ MODEL_BASE = os.getenv("MODEL_BASE", "./ckpts")
109
+
110
+ # =================== Data =======================
111
+ DATA_TYPE = {"image", "video", "image_video"}
112
+
113
+ # 3D VAE
114
+ VAE_PATH = {"884-16c-hy": f"{MODEL_BASE}/hunyuan-video-t2v-720p/vae"}
115
+
116
+ # Text Encoder
117
+ TEXT_ENCODER_PATH = {
118
+ "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
119
+ "llm": f"{MODEL_BASE}/llava-llama-3-8b",
120
+ "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
121
+ }
122
+
123
+ # Tokenizer
124
+ TOKENIZER_PATH = {
125
+ "clipL": f"{MODEL_BASE}/clip_vit_large_patch14",
126
+ "llm": f"{MODEL_BASE}/llava-llama-3-8b",
127
+ "llm-i2v": f"{MODEL_BASE}/llava-llama-3-8b",
128
+ }
129
+
130
+ TEXT_PROJECTION = {
131
+ "linear", # Default, an nn.Linear() layer
132
+ "single_refiner", # Single TokenRefiner. Refer to LI-DiT
133
+ }
134
+
135
+ # Flow Matching path type
136
+ FLOW_PATH_TYPE = {
137
+ "linear", # Linear trajectory between noise and data
138
+ "gvp", # Generalized variance-preserving SDE
139
+ "vp", # Variance-preserving SDE
140
+ }
141
+
142
+ # Flow Matching predict type
143
+ FLOW_PREDICT_TYPE = {
144
+ "velocity", # Predict velocity
145
+ "score", # Predict score
146
+ "noise", # Predict noise
147
+ }
148
+
149
+ # Flow Matching loss weight
150
+ FLOW_LOSS_WEIGHT = {
151
+ "velocity", # Weight loss by velocity
152
+ "likelihood", # Weight loss by likelihood
153
+ }
154
+
155
+ # Flow Matching SNR type
156
+ FLOW_SNR_TYPE = {
157
+ "lognorm", # Log-normal SNR
158
+ "uniform", # Uniform SNR
159
+ }
160
+
161
+ # Flow Matching solvers
162
+ FLOW_SOLVER = {
163
+ "euler", # Euler solver
164
+ }
hyvideo/data_kits/audio_dataset.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import math
4
+ import json
5
+ import torch
6
+ import random
7
+ import librosa
8
+ import traceback
9
+ import torchvision
10
+ import numpy as np
11
+ import pandas as pd
12
+ from PIL import Image
13
+ from einops import rearrange
14
+ from torch.utils.data import Dataset
15
+ from decord import VideoReader, cpu
16
+ from transformers import CLIPImageProcessor
17
+ import torchvision.transforms as transforms
18
+ from torchvision.transforms import ToPILImage
19
+
20
+
21
+
22
+ def get_audio_feature(feature_extractor, audio_path):
23
+ audio_input, sampling_rate = librosa.load(audio_path, sr=16000)
24
+ assert sampling_rate == 16000
25
+
26
+ audio_features = []
27
+ window = 750*640
28
+ for i in range(0, len(audio_input), window):
29
+ audio_feature = feature_extractor(audio_input[i:i+window],
30
+ sampling_rate=sampling_rate,
31
+ return_tensors="pt",
32
+ ).input_features
33
+ audio_features.append(audio_feature)
34
+
35
+ audio_features = torch.cat(audio_features, dim=-1)
36
+ return audio_features, len(audio_input) // 640
37
+
38
+
39
+ class VideoAudioTextLoaderVal(Dataset):
40
+ def __init__(
41
+ self,
42
+ image_size: int,
43
+ meta_file: str,
44
+ **kwargs,
45
+ ):
46
+ super().__init__()
47
+ self.meta_file = meta_file
48
+ self.image_size = image_size
49
+ self.text_encoder = kwargs.get("text_encoder", None) # llava_text_encoder
50
+ self.text_encoder_2 = kwargs.get("text_encoder_2", None) # clipL_text_encoder
51
+ self.feature_extractor = kwargs.get("feature_extractor", None)
52
+ self.meta_files = []
53
+
54
+ csv_data = pd.read_csv(meta_file)
55
+ for idx in range(len(csv_data)):
56
+ self.meta_files.append(
57
+ {
58
+ "videoid": str(csv_data["videoid"][idx]),
59
+ "image_path": str(csv_data["image"][idx]),
60
+ "audio_path": str(csv_data["audio"][idx]),
61
+ "prompt": str(csv_data["prompt"][idx]),
62
+ "fps": float(csv_data["fps"][idx])
63
+ }
64
+ )
65
+
66
+ self.llava_transform = transforms.Compose(
67
+ [
68
+ transforms.Resize((336, 336), interpolation=transforms.InterpolationMode.BILINEAR),
69
+ transforms.ToTensor(),
70
+ transforms.Normalize((0.48145466, 0.4578275, 0.4082107), (0.26862954, 0.26130258, 0.27577711)),
71
+ ]
72
+ )
73
+ self.clip_image_processor = CLIPImageProcessor()
74
+
75
+ self.device = torch.device("cuda")
76
+ self.weight_dtype = torch.float16
77
+
78
+
79
+ def __len__(self):
80
+ return len(self.meta_files)
81
+
82
+ @staticmethod
83
+ def get_text_tokens(text_encoder, description, dtype_encode="video"):
84
+ text_inputs = text_encoder.text2tokens(description, data_type=dtype_encode)
85
+ text_ids = text_inputs["input_ids"].squeeze(0)
86
+ text_mask = text_inputs["attention_mask"].squeeze(0)
87
+ return text_ids, text_mask
88
+
89
+ def get_batch_data(self, idx):
90
+ meta_file = self.meta_files[idx]
91
+ videoid = meta_file["videoid"]
92
+ image_path = meta_file["image_path"]
93
+ audio_path = meta_file["audio_path"]
94
+ prompt = "Authentic, Realistic, Natural, High-quality, Lens-Fixed, " + meta_file["prompt"]
95
+ fps = meta_file["fps"]
96
+
97
+ img_size = self.image_size
98
+ ref_image = Image.open(image_path).convert('RGB')
99
+
100
+ # Resize reference image
101
+ w, h = ref_image.size
102
+ scale = img_size / min(w, h)
103
+ new_w = round(w * scale / 64) * 64
104
+ new_h = round(h * scale / 64) * 64
105
+
106
+ if img_size == 704:
107
+ img_size_long = 1216
108
+ if new_w * new_h > img_size * img_size_long:
109
+ import math
110
+ scale = math.sqrt(img_size * img_size_long / w / h)
111
+ new_w = round(w * scale / 64) * 64
112
+ new_h = round(h * scale / 64) * 64
113
+
114
+ ref_image = ref_image.resize((new_w, new_h), Image.LANCZOS)
115
+
116
+ ref_image = np.array(ref_image)
117
+ ref_image = torch.from_numpy(ref_image)
118
+
119
+ audio_input, audio_len = get_audio_feature(self.feature_extractor, audio_path)
120
+ audio_prompts = audio_input[0]
121
+
122
+ motion_bucket_id_heads = np.array([25] * 4)
123
+ motion_bucket_id_exps = np.array([30] * 4)
124
+ motion_bucket_id_heads = torch.from_numpy(motion_bucket_id_heads)
125
+ motion_bucket_id_exps = torch.from_numpy(motion_bucket_id_exps)
126
+ fps = torch.from_numpy(np.array(fps))
127
+
128
+ to_pil = ToPILImage()
129
+ pixel_value_ref = rearrange(ref_image.clone().unsqueeze(0), "b h w c -> b c h w") # (b c h w)
130
+
131
+ pixel_value_ref_llava = [self.llava_transform(to_pil(image)) for image in pixel_value_ref]
132
+ pixel_value_ref_llava = torch.stack(pixel_value_ref_llava, dim=0)
133
+ pixel_value_ref_clip = self.clip_image_processor(
134
+ images=Image.fromarray((pixel_value_ref[0].permute(1,2,0)).data.cpu().numpy().astype(np.uint8)),
135
+ return_tensors="pt"
136
+ ).pixel_values[0]
137
+ pixel_value_ref_clip = pixel_value_ref_clip.unsqueeze(0)
138
+
139
+ # Encode text prompts
140
+
141
+ text_ids, text_mask = self.get_text_tokens(self.text_encoder, prompt)
142
+ text_ids_2, text_mask_2 = self.get_text_tokens(self.text_encoder_2, prompt)
143
+
144
+ # Output batch
145
+ batch = {
146
+ "text_prompt": prompt, #
147
+ "videoid": videoid,
148
+ "pixel_value_ref": pixel_value_ref.to(dtype=torch.float16), # 参考图,用于vae提特征 (1, 3, h, w), 取值范围(0, 255)
149
+ "pixel_value_ref_llava": pixel_value_ref_llava.to(dtype=torch.float16), # 参考图,用于llava提特征 (1, 3, 336, 336), 取值范围 = CLIP取值范围
150
+ "pixel_value_ref_clip": pixel_value_ref_clip.to(dtype=torch.float16), # 参考图,用于clip_image_encoder提特征 (1, 3, 244, 244), 取值范围 = CLIP取值范围
151
+ "audio_prompts": audio_prompts.to(dtype=torch.float16),
152
+ "motion_bucket_id_heads": motion_bucket_id_heads.to(dtype=text_ids.dtype),
153
+ "motion_bucket_id_exps": motion_bucket_id_exps.to(dtype=text_ids.dtype),
154
+ "fps": fps.to(dtype=torch.float16),
155
+ "text_ids": text_ids.clone(), # 对应llava_text_encoder
156
+ "text_mask": text_mask.clone(), # 对应llava_text_encoder
157
+ "text_ids_2": text_ids_2.clone(), # 对应clip_text_encoder
158
+ "text_mask_2": text_mask_2.clone(), # 对应clip_text_encoder
159
+ "audio_len": audio_len,
160
+ "image_path": image_path,
161
+ "audio_path": audio_path,
162
+ }
163
+ return batch
164
+
165
+ def __getitem__(self, idx):
166
+ return self.get_batch_data(idx)
167
+
168
+
169
+
170
+
hyvideo/data_kits/audio_preprocessor.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import cv2
4
+ import json
5
+ import time
6
+ import decord
7
+ import einops
8
+ import librosa
9
+ import torch
10
+ import random
11
+ import argparse
12
+ import traceback
13
+ import numpy as np
14
+ from tqdm import tqdm
15
+ from PIL import Image
16
+ from einops import rearrange
17
+
18
+
19
+
20
+ def get_facemask(ref_image, align_instance, area=1.25):
21
+ # ref_image: (b f c h w)
22
+ bsz, f, c, h, w = ref_image.shape
23
+ images = rearrange(ref_image, "b f c h w -> (b f) h w c").data.cpu().numpy().astype(np.uint8)
24
+ face_masks = []
25
+ for image in images:
26
+ image_pil = Image.fromarray(image).convert("RGB")
27
+ _, _, bboxes_list = align_instance(np.array(image_pil)[:,:,[2,1,0]], maxface=True)
28
+ try:
29
+ bboxSrc = bboxes_list[0]
30
+ except IndexError:
31
+ bboxSrc = [0, 0, w, h]
32
+ x1, y1, ww, hh = bboxSrc
33
+ x2, y2 = x1 + ww, y1 + hh
34
+ ww, hh = (x2-x1) * area, (y2-y1) * area
35
+ center = [(x2+x1)//2, (y2+y1)//2]
36
+ x1 = max(center[0] - ww//2, 0)
37
+ y1 = max(center[1] - hh//2, 0)
38
+ x2 = min(center[0] + ww//2, w)
39
+ y2 = min(center[1] + hh//2, h)
40
+
41
+ face_mask = np.zeros_like(np.array(image_pil))
42
+ face_mask[int(y1):int(y2), int(x1):int(x2)] = 1.0
43
+ face_masks.append(torch.from_numpy(face_mask[...,:1]))
44
+ face_masks = torch.stack(face_masks, dim=0) # (b*f, h, w, c)
45
+ face_masks = rearrange(face_masks, "(b f) h w c -> b c f h w", b=bsz, f=f)
46
+ face_masks = face_masks.to(device=ref_image.device, dtype=ref_image.dtype)
47
+ return face_masks
48
+
49
+
50
+ def encode_audio(wav2vec, audio_feats, fps, num_frames=129):
51
+ if fps == 25:
52
+ start_ts = [0]
53
+ step_ts = [1]
54
+ elif fps == 12.5:
55
+ start_ts = [0]
56
+ step_ts = [2]
57
+ else:
58
+ start_ts = [0]
59
+ step_ts = [1]
60
+
61
+ num_frames = min(num_frames, 400)
62
+ audio_feats = wav2vec.encoder(audio_feats.unsqueeze(0)[:, :, :3000], output_hidden_states=True).hidden_states
63
+ audio_feats = torch.stack(audio_feats, dim=2)
64
+ audio_feats = torch.cat([torch.zeros_like(audio_feats[:,:4]), audio_feats], 1)
65
+
66
+ audio_prompts = []
67
+ for bb in range(1):
68
+ audio_feats_list = []
69
+ for f in range(num_frames):
70
+ cur_t = (start_ts[bb] + f * step_ts[bb]) * 2
71
+ audio_clip = audio_feats[bb:bb+1, cur_t: cur_t+10]
72
+ audio_feats_list.append(audio_clip)
73
+ audio_feats_list = torch.stack(audio_feats_list, 1)
74
+ audio_prompts.append(audio_feats_list)
75
+ audio_prompts = torch.cat(audio_prompts)
76
+ return audio_prompts
hyvideo/data_kits/data_tools.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import torch
4
+ import numpy as np
5
+ import imageio
6
+ import torchvision
7
+ from einops import rearrange
8
+
9
+
10
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8, quality=8):
11
+ videos = rearrange(videos, "b c t h w -> t b c h w")
12
+ outputs = []
13
+ for x in videos:
14
+ x = torchvision.utils.make_grid(x, nrow=n_rows)
15
+ x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
16
+ if rescale:
17
+ x = (x + 1.0) / 2.0 # -1,1 -> 0,1
18
+ x = torch.clamp(x,0,1)
19
+ x = (x * 255).numpy().astype(np.uint8)
20
+ outputs.append(x)
21
+
22
+ os.makedirs(os.path.dirname(path), exist_ok=True)
23
+ imageio.mimsave(path, outputs, fps=fps, quality=quality)
24
+
25
+ def pad_image(crop_img, size, color=(255, 255, 255), resize_ratio=1):
26
+ crop_h, crop_w = crop_img.shape[:2]
27
+ target_w, target_h = size
28
+ scale_h, scale_w = target_h / crop_h, target_w / crop_w
29
+ if scale_w > scale_h:
30
+ resize_h = int(target_h*resize_ratio)
31
+ resize_w = int(crop_w / crop_h * resize_h)
32
+ else:
33
+ resize_w = int(target_w*resize_ratio)
34
+ resize_h = int(crop_h / crop_w * resize_w)
35
+ crop_img = cv2.resize(crop_img, (resize_w, resize_h))
36
+ pad_left = (target_w - resize_w) // 2
37
+ pad_top = (target_h - resize_h) // 2
38
+ pad_right = target_w - resize_w - pad_left
39
+ pad_bottom = target_h - resize_h - pad_top
40
+ crop_img = cv2.copyMakeBorder(crop_img, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color)
41
+ return crop_img
hyvideo/data_kits/face_align/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .align import AlignImage
hyvideo/data_kits/face_align/align.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import torch
4
+ from .detface import DetFace
5
+
6
+ class AlignImage(object):
7
+ def __init__(self, device='cuda', det_path=''):
8
+ self.facedet = DetFace(pt_path=det_path, confThreshold=0.5, nmsThreshold=0.45, device=device)
9
+
10
+ @torch.no_grad()
11
+ def __call__(self, im, maxface=False):
12
+ bboxes, kpss, scores = self.facedet.detect(im)
13
+ face_num = bboxes.shape[0]
14
+
15
+ five_pts_list = []
16
+ scores_list = []
17
+ bboxes_list = []
18
+ for i in range(face_num):
19
+ five_pts_list.append(kpss[i].reshape(5,2))
20
+ scores_list.append(scores[i])
21
+ bboxes_list.append(bboxes[i])
22
+
23
+ if maxface and face_num>1:
24
+ max_idx = 0
25
+ max_area = (bboxes[0, 2])*(bboxes[0, 3])
26
+ for i in range(1, face_num):
27
+ area = (bboxes[i,2])*(bboxes[i,3])
28
+ if area>max_area:
29
+ max_idx = i
30
+ five_pts_list = [five_pts_list[max_idx]]
31
+ scores_list = [scores_list[max_idx]]
32
+ bboxes_list = [bboxes_list[max_idx]]
33
+
34
+ return five_pts_list, scores_list, bboxes_list