Spaces:
Running
on
Zero
Running
on
Zero
Update app.py (#2)
Browse files- Update app.py (c11c3fe799c45fa50d6c067190651848d8714290)
- Update app.py (b60f9e03d21e8ef40479b00292d38e05e7953385)
app.py
CHANGED
@@ -1,23 +1,20 @@
|
|
1 |
import torch
|
|
|
2 |
from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
|
3 |
from diffusers.utils import export_to_video
|
4 |
-
from transformers import CLIPVisionModel
|
5 |
import gradio as gr
|
6 |
import tempfile
|
7 |
import spaces
|
8 |
from huggingface_hub import hf_hub_download
|
9 |
import numpy as np
|
10 |
-
import
|
11 |
import random
|
12 |
|
13 |
-
|
14 |
-
|
15 |
model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
|
16 |
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
17 |
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
|
18 |
|
19 |
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
|
20 |
-
pipe.to("cuda")
|
21 |
|
22 |
pipe.load_lora_weights(
|
23 |
"vrgamedevgirl84/Wan14BT2VFusioniX",
|
@@ -96,17 +93,17 @@ def update_prompt_from_mode(mode):
|
|
96 |
return MODE_PROMPTS.get(mode, "")
|
97 |
|
98 |
|
99 |
-
def prepare_video_and_mask_Ref2V(
|
100 |
frames = []
|
101 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
102 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
103 |
# match the original code.
|
104 |
-
frames.extend([
|
105 |
-
mask_white =
|
106 |
mask = [mask_white] * (num_frames)
|
107 |
return frames, mask
|
108 |
|
109 |
-
def prepare_video_and_mask_FLF2V(first_img:
|
110 |
first_img = first_img.resize((width, height))
|
111 |
last_img = last_img.resize((width, height))
|
112 |
frames = []
|
@@ -114,26 +111,26 @@ def prepare_video_and_mask_FLF2V(first_img: PIL.Image.Image, last_img: PIL.Image
|
|
114 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
115 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
116 |
# match the original code.
|
117 |
-
frames.extend([
|
118 |
frames.append(last_img)
|
119 |
-
mask_black =
|
120 |
-
mask_white =
|
121 |
mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
|
122 |
return frames, mask
|
123 |
|
124 |
-
def prepare_video_and_mask_Random2V(images: List[
|
125 |
images = [img.resize((width, height)) for img in images]
|
126 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
127 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
128 |
# match the original code.
|
129 |
-
frames = [
|
130 |
|
131 |
-
mask_black =
|
132 |
-
mask_white =
|
133 |
mask = [mask_white] * num_frames
|
134 |
|
135 |
for img, idx in zip(images, frame_indices):
|
136 |
-
assert idx < num_frames
|
137 |
frames[idx] = img
|
138 |
mask[idx] = mask_black
|
139 |
|
@@ -183,7 +180,7 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
183 |
if mode == "FLF2V" and len(gallery_images) >= 2:
|
184 |
gallery_images = gallery_images[:2]
|
185 |
elif mode == "FLF2V" and len(gallery_images) < 2:
|
186 |
-
raise gr.Error("
|
187 |
|
188 |
target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
|
189 |
target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
|
@@ -192,20 +189,29 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
192 |
|
193 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
194 |
|
195 |
-
|
196 |
# Process images based on the selected mode
|
197 |
if mode == "FLF2V":
|
198 |
-
frames, mask = prepare_video_and_mask_FLF2V(
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
elif mode == "Ref2V":
|
201 |
frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
|
202 |
-
reference_images =gallery_images
|
203 |
-
else:
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
|
|
|
|
|
|
|
|
209 |
|
210 |
with torch.inference_mode():
|
211 |
output_frames_list = pipe(
|
@@ -228,8 +234,8 @@ def generate_video(gallery_images, mode, prompt, height, width,
|
|
228 |
return video_path, current_seed
|
229 |
|
230 |
with gr.Blocks() as demo:
|
231 |
-
gr.Markdown("#
|
232 |
-
gr.Markdown("[
|
233 |
|
234 |
with gr.Row():
|
235 |
with gr.Column():
|
@@ -251,11 +257,18 @@ with gr.Blocks() as demo:
|
|
251 |
choices=["Ref2V", "FLF2V", "Random2V"],
|
252 |
value="Ref2V",
|
253 |
label="Processing Mode",
|
254 |
-
info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random
|
255 |
)
|
256 |
|
257 |
prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
|
258 |
-
duration_seconds_input = gr.Slider(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
259 |
|
260 |
with gr.Accordion("Advanced Settings", open=False):
|
261 |
negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
|
@@ -271,12 +284,14 @@ with gr.Blocks() as demo:
|
|
271 |
|
272 |
with gr.Column():
|
273 |
video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
|
274 |
-
with gr.Accordion("Mode Information", open=
|
275 |
gr.Markdown("""
|
276 |
**Processing Modes:**
|
277 |
-
- **Ref2V**: Uses
|
278 |
-
- **FLF2V**:
|
279 |
-
- **Random2V**:
|
|
|
|
|
280 |
""")
|
281 |
|
282 |
# Update prompt when mode changes
|
|
|
1 |
import torch
|
2 |
+
from typing import List
|
3 |
from diffusers import AutoencoderKLWan, WanVACEPipeline, UniPCMultistepScheduler
|
4 |
from diffusers.utils import export_to_video
|
|
|
5 |
import gradio as gr
|
6 |
import tempfile
|
7 |
import spaces
|
8 |
from huggingface_hub import hf_hub_download
|
9 |
import numpy as np
|
10 |
+
from PIL import Image
|
11 |
import random
|
12 |
|
|
|
|
|
13 |
model_id = "Wan-AI/Wan2.1-VACE-14B-diffusers"
|
14 |
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
15 |
pipe = WanVACEPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")
|
16 |
|
17 |
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=2.0)
|
|
|
18 |
|
19 |
pipe.load_lora_weights(
|
20 |
"vrgamedevgirl84/Wan14BT2VFusioniX",
|
|
|
93 |
return MODE_PROMPTS.get(mode, "")
|
94 |
|
95 |
|
96 |
+
def prepare_video_and_mask_Ref2V(height: int, width: int, num_frames: int):
    """Build the placeholder video frames and mask used for Ref2V mode.

    Args:
        height: Height in pixels of every generated image.
        width: Width in pixels of every generated image.
        num_frames: Number of entries in each returned list.

    Returns:
        A ``(frames, mask)`` tuple of two lists, each ``num_frames`` long.
        ``frames`` repeats one solid gray ``(128, 128, 128)`` RGB image and
        ``mask`` repeats one solid white (``255``) single-channel ("L") image.
        Every entry of a list refers to the same PIL image object, so an
        in-place change to one entry is visible through all of them.
    """
    size = (width, height)  # PIL takes sizes as (width, height)

    # Ideally, the gray level should be 127.5 to match original code, but they perform computation on numpy arrays
    # whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
    # match the original code.
    gray_frame = Image.new("RGB", size, (128, 128, 128))
    white_mask = Image.new("L", size, 255)

    return [gray_frame] * num_frames, [white_mask] * num_frames
|
105 |
|
106 |
+
def prepare_video_and_mask_FLF2V(first_img: Image.Image, last_img: Image.Image, height: int, width: int, num_frames: int):
|
107 |
first_img = first_img.resize((width, height))
|
108 |
last_img = last_img.resize((width, height))
|
109 |
frames = []
|
|
|
111 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
112 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
113 |
# match the original code.
|
114 |
+
frames.extend([Image.new("RGB", (width, height), (128, 128, 128))] * (num_frames - 2))
|
115 |
frames.append(last_img)
|
116 |
+
mask_black = Image.new("L", (width, height), 0)
|
117 |
+
mask_white = Image.new("L", (width, height), 255)
|
118 |
mask = [mask_black, *[mask_white] * (num_frames - 2), mask_black]
|
119 |
return frames, mask
|
120 |
|
121 |
+
def prepare_video_and_mask_Random2V(images: List[Image.Image], frame_indices: List[int], height: int, width: int, num_frames: int):
|
122 |
images = [img.resize((width, height)) for img in images]
|
123 |
# Ideally, this should be 127.5 to match original code, but they perform computation on numpy arrays
|
124 |
# whereas we are passing PIL images. If you choose to pass numpy arrays, you can set it to 127.5 to
|
125 |
# match the original code.
|
126 |
+
frames = [Image.new("RGB", (width, height), (128, 128, 128))] * num_frames
|
127 |
|
128 |
+
mask_black = Image.new("L", (width, height), 0)
|
129 |
+
mask_white = Image.new("L", (width, height), 255)
|
130 |
mask = [mask_white] * num_frames
|
131 |
|
132 |
for img, idx in zip(images, frame_indices):
|
133 |
+
assert idx < num_frames, f"Frame index {idx} exceeds num_frames {num_frames}"
|
134 |
frames[idx] = img
|
135 |
mask[idx] = mask_black
|
136 |
|
|
|
180 |
if mode == "FLF2V" and len(gallery_images) >= 2:
|
181 |
gallery_images = gallery_images[:2]
|
182 |
elif mode == "FLF2V" and len(gallery_images) < 2:
|
183 |
+
raise gr.Error("FLF2V mode requires at least 2 images, but only {} were supplied.".format(len(gallery_images)))
|
184 |
|
185 |
target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
|
186 |
target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
|
|
|
189 |
|
190 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
191 |
|
|
|
192 |
# Process images based on the selected mode
|
193 |
if mode == "FLF2V":
|
194 |
+
frames, mask = prepare_video_and_mask_FLF2V(
|
195 |
+
first_img=gallery_images[0],
|
196 |
+
last_img=gallery_images[1],
|
197 |
+
height=target_h,
|
198 |
+
width=target_w,
|
199 |
+
num_frames=num_frames
|
200 |
+
)
|
201 |
+
reference_images = None
|
202 |
elif mode == "Ref2V":
|
203 |
frames, mask = prepare_video_and_mask_Ref2V(height=target_h, width=target_w, num_frames=num_frames)
|
204 |
+
reference_images = gallery_images
|
205 |
+
else: # mode == "Random2V"
|
206 |
+
|
207 |
+
frames, mask = prepare_video_and_mask_Random2V(
|
208 |
+
images=gallery_images,
|
209 |
+
frame_indices=[0,20,40], # todo - generalize
|
210 |
+
height=target_h,
|
211 |
+
width=target_w,
|
212 |
+
num_frames=num_frames
|
213 |
+
)
|
214 |
+
reference_images = None
|
215 |
|
216 |
with torch.inference_mode():
|
217 |
output_frames_list = pipe(
|
|
|
234 |
return video_path, current_seed
|
235 |
|
236 |
with gr.Blocks() as demo:
|
237 |
+
gr.Markdown("# Wan 2.1 VACE (14B) with Phantom & Detail Enhancer LoRAs - Multi-Image Gallery")
|
238 |
+
gr.Markdown("Using [Wan2.1-VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B-diffusers) with Phantom FusionX and Detail Enhancer LoRAs for advanced video generation with multiple conditioning modes.")
|
239 |
|
240 |
with gr.Row():
|
241 |
with gr.Column():
|
|
|
257 |
choices=["Ref2V", "FLF2V", "Random2V"],
|
258 |
value="Ref2V",
|
259 |
label="Processing Mode",
|
260 |
+
info="Ref2V: Reference to Video | FLF2V: First-Last Frame to Video | Random2V: Random frames to Video"
|
261 |
)
|
262 |
|
263 |
prompt_input = gr.Textbox(label="Prompt", value=MODE_PROMPTS["Ref2V"])
|
264 |
+
duration_seconds_input = gr.Slider(
|
265 |
+
minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
|
266 |
+
maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
|
267 |
+
step=0.1,
|
268 |
+
value=2,
|
269 |
+
label="Duration (seconds)",
|
270 |
+
info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
|
271 |
+
)
|
272 |
|
273 |
with gr.Accordion("Advanced Settings", open=False):
|
274 |
negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
|
|
|
284 |
|
285 |
with gr.Column():
|
286 |
video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
|
287 |
+
with gr.Accordion("Mode Information", open=False):
|
288 |
gr.Markdown("""
|
289 |
**Processing Modes:**
|
290 |
+
- **Ref2V**: Uses uploaded images as style references for video generation. All frames are generated based on the reference images.
|
291 |
+
- **FLF2V**: First-Last Frame mode - uses first and last images as keyframes and generates the frames in between (requires exactly 2 images)
|
292 |
+
- **Random2V**: Places uploaded images at specific frames in the video and generates the rest. Images are distributed evenly across the video duration.
|
293 |
+
|
294 |
+
**Note**: VACE pipeline supports advanced conditioning with masks and reference images for more control over generation.
|
295 |
""")
|
296 |
|
297 |
# Update prompt when mode changes
|