Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,9 @@
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
|
| 3 |
|
| 4 |
import spaces
|
|
@@ -13,7 +18,13 @@ import numpy as np
|
|
| 13 |
from PIL import Image
|
| 14 |
import random
|
| 15 |
import gc
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Model configurations
|
| 19 |
T2V_MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
|
|
@@ -24,62 +35,89 @@ MAX_SEED = np.iinfo(np.int32).max
|
|
| 24 |
FIXED_FPS = 16
|
| 25 |
MIN_FRAMES_MODEL = 8
|
| 26 |
MAX_FRAMES_MODEL = 81
|
| 27 |
-
MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
|
| 28 |
-
MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
),
|
| 38 |
-
transformer_2=WanTransformer3DModel.from_pretrained('linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
|
| 39 |
-
subfolder='transformer_2',
|
| 40 |
-
torch_dtype=torch.bfloat16,
|
| 41 |
-
device_map='cuda',
|
| 42 |
-
),
|
| 43 |
-
vae=vae,
|
| 44 |
-
torch_dtype=torch.bfloat16,
|
| 45 |
-
).to('cuda')
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
subfolder=
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
)
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
)
|
| 83 |
|
| 84 |
# Default prompts
|
| 85 |
default_prompt_t2v = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
|
|
@@ -107,11 +145,23 @@ def resize_image_landscape(image: Image.Image) -> Image.Image:
|
|
| 107 |
image = image.crop((0, top, width, top + new_height))
|
| 108 |
return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
|
| 109 |
|
| 110 |
-
def get_duration(
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
return int(steps) * 15
|
| 113 |
|
| 114 |
@spaces.GPU(duration=get_duration)
|
|
|
|
| 115 |
def generate_video(
|
| 116 |
mode,
|
| 117 |
input_image,
|
|
@@ -125,14 +175,13 @@ def generate_video(
|
|
| 125 |
randomize_seed=False,
|
| 126 |
progress=gr.Progress(track_tqdm=True),
|
| 127 |
):
|
| 128 |
-
if mode == "Image-to-Video" and input_image is None:
|
| 129 |
-
raise gr.Error("Please upload an input image for Image-to-Video mode.")
|
| 130 |
-
|
| 131 |
num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
|
| 132 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
| 133 |
|
| 134 |
if mode == "Text-to-Video":
|
| 135 |
-
|
|
|
|
|
|
|
| 136 |
prompt=prompt,
|
| 137 |
negative_prompt=negative_prompt,
|
| 138 |
height=LANDSCAPE_HEIGHT,
|
|
@@ -144,8 +193,12 @@ def generate_video(
|
|
| 144 |
generator=torch.Generator(device="cuda").manual_seed(current_seed),
|
| 145 |
).frames[0]
|
| 146 |
else: # Image-to-Video
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
resized_image = resize_image(input_image)
|
| 148 |
-
output_frames_list =
|
| 149 |
image=resized_image,
|
| 150 |
prompt=prompt,
|
| 151 |
negative_prompt=negative_prompt,
|
|
@@ -162,6 +215,7 @@ def generate_video(
|
|
| 162 |
video_path = tmpfile.name
|
| 163 |
|
| 164 |
export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
|
|
|
|
| 165 |
return video_path, current_seed
|
| 166 |
|
| 167 |
with gr.Blocks() as demo:
|
|
@@ -201,6 +255,7 @@ with gr.Blocks() as demo:
|
|
| 201 |
|
| 202 |
gr.Examples(
|
| 203 |
examples=[
|
|
|
|
| 204 |
["Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."],
|
| 205 |
["A cinematic shot of a boat sailing on a calm sea at sunset."],
|
| 206 |
["Drone footage flying over a futuristic city with flying cars."],
|
|
|
|
| 1 |
import os
import subprocess
import sys

# Set environment variables before any imports to suppress inductor warnings.
os.environ["TORCHINDUCTOR_CUDA_GRAPHS"] = "0"
os.environ["TORCHINDUCTOR_MAX_AUTOTUNE_GEMM"] = "0"

# Install dependencies as specified. pip is invoked through the running
# interpreter with an argument list (no shell), so the packages are installed
# into the environment that imports them below and no shell quoting is needed.
_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "--upgrade", "--pre",
        "--extra-index-url", "https://download.pytorch.org/whl/nightly/cu126",
        "torch<2.9", "spaces",
    ],
    check=False,  # stay best-effort: a failed install must not abort startup
)
if _pip_result.returncode != 0:
    # The exit status used to be discarded; surface it so a broken install is
    # visible in the logs instead of showing up later as an import error.
    print(
        f"WARNING: dependency install exited with code {_pip_result.returncode}",
        file=sys.stderr,
    )
|
| 8 |
|
| 9 |
import spaces
|
|
|
|
| 18 |
from PIL import Image
|
| 19 |
import random
|
| 20 |
import gc
|
| 21 |
+
|
| 22 |
+
# optimize_pipeline_ is expected to come from the `optimization` module; when
# that module cannot be imported, fall back to a stub with the same call shape.
try:
    from optimization import optimize_pipeline_
except ImportError:
    def optimize_pipeline_(pipe, **kwargs):
        """Do nothing; stand-in used when `optimization` is unavailable."""
        return None
|
| 28 |
|
| 29 |
# Model configurations
|
| 30 |
T2V_MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
|
|
|
|
| 35 |
FIXED_FPS = 16
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 81

# Duration bounds derived from the frame limits at the fixed frame rate,
# rounded to one decimal place.
MIN_DURATION = round(MIN_FRAMES_MODEL / FIXED_FPS, 1)
MAX_DURATION = round(MAX_FRAMES_MODEL / FIXED_FPS, 1)

# Single-slot holders for the pipelines: index 0 is either None or the
# pipeline object stored by the corresponding load function.
t2v_pipe_cache = [None]
i2v_pipe_cache = [None]
|
| 44 |
|
| 45 |
+
def clear_memory():
    """Run three garbage-collection rounds, releasing cached CUDA memory."""
    for _round in range(3):
        gc.collect()
        if not torch.cuda.is_available():
            # CPU-only host: nothing GPU-side to release.
            continue
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
+
def load_t2v_pipeline():
    """Return the T2V pipeline, building and caching it on first use."""
    cached = t2v_pipe_cache[0]
    if cached is not None:
        return cached

    bf16_repo = 'linoyts/Wan2.2-T2V-A14B-Diffusers-BF16'

    # The VAE is loaded in float32 while both transformers use bfloat16.
    vae = AutoencoderKLWan.from_pretrained(
        T2V_MODEL_ID, subfolder="vae", torch_dtype=torch.float32
    )
    transformer = WanTransformer3DModel.from_pretrained(
        bf16_repo,
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )
    transformer_2 = WanTransformer3DModel.from_pretrained(
        bf16_repo,
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )

    pipe = WanPipeline.from_pretrained(
        T2V_MODEL_ID,
        transformer=transformer,
        transformer_2=transformer_2,
        vae=vae,
        torch_dtype=torch.bfloat16,
    ).to('cuda')

    # Store the pipeline before optimizing it, so the slot is already filled
    # if one of the following steps raises.
    t2v_pipe_cache[0] = pipe

    example_call = {
        'prompt': 'prompt',
        'height': LANDSCAPE_HEIGHT,
        'width': LANDSCAPE_WIDTH,
        'num_frames': MAX_FRAMES_MODEL,
    }
    optimize_pipeline_(pipe, **example_call)

    # Enable CPU offload for memory optimization.
    # NOTE(review): the pipeline was moved to CUDA just above and offload is
    # switched on afterwards - confirm this ordering is intended.
    pipe.enable_model_cpu_offload()
    clear_memory()
    return pipe
|
| 80 |
|
| 81 |
+
def load_i2v_pipeline():
    """Return the I2V pipeline, building and caching it on first use."""
    cached = i2v_pipe_cache[0]
    if cached is not None:
        return cached

    bf16_repo = 'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers'

    transformer = WanTransformer3DModel.from_pretrained(
        bf16_repo,
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )
    transformer_2 = WanTransformer3DModel.from_pretrained(
        bf16_repo,
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    )

    pipe = WanImageToVideoPipeline.from_pretrained(
        I2V_MODEL_ID,
        transformer=transformer,
        transformer_2=transformer_2,
        torch_dtype=torch.bfloat16,
    ).to('cuda')

    # Store the pipeline before optimizing it, so the slot is already filled
    # if one of the following steps raises.
    i2v_pipe_cache[0] = pipe

    # A blank landscape-sized image stands in for a real input here.
    example_call = {
        'image': Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
        'prompt': 'prompt',
        'height': LANDSCAPE_HEIGHT,
        'width': LANDSCAPE_WIDTH,
        'num_frames': MAX_FRAMES_MODEL,
    }
    optimize_pipeline_(pipe, **example_call)

    # Enable CPU offload for memory optimization.
    # NOTE(review): the pipeline was moved to CUDA just above and offload is
    # switched on afterwards - confirm this ordering is intended.
    pipe.enable_model_cpu_offload()
    clear_memory()
    return pipe
|
| 107 |
|
| 108 |
+
def unload_t2v_pipeline():
    """Drop the cached T2V pipeline, if one is loaded.

    The pipeline is moved to the CPU, the cache slot is reset to ``None`` and
    ``clear_memory()`` is run. Does nothing when the slot is already empty.
    """
    pipe = t2v_pipe_cache[0]
    if pipe is None:
        return

    pipe.to("cpu")
    # Reset the slot in place; the list must keep exactly one element.
    # Deleting index 0 (as the previous code did) empties the list, so the
    # follow-up `t2v_pipe_cache[0] = None` raises IndexError and every later
    # `t2v_pipe_cache[0]` lookup fails as well.
    t2v_pipe_cache[0] = None
    del pipe  # drop the last local reference before collecting garbage
    clear_memory()
|
|
|
|
| 114 |
|
| 115 |
+
def unload_i2v_pipeline():
    """Drop the cached I2V pipeline, if one is loaded.

    The pipeline is moved to the CPU, the cache slot is reset to ``None`` and
    ``clear_memory()`` is run. Does nothing when the slot is already empty.
    """
    pipe = i2v_pipe_cache[0]
    if pipe is None:
        return

    pipe.to("cpu")
    # Reset the slot in place; the list must keep exactly one element.
    # Deleting index 0 (as the previous code did) empties the list, so the
    # follow-up `i2v_pipe_cache[0] = None` raises IndexError and every later
    # `i2v_pipe_cache[0]` lookup fails as well.
    i2v_pipe_cache[0] = None
    del pipe  # drop the last local reference before collecting garbage
    clear_memory()
|
|
|
|
| 121 |
|
| 122 |
# Default prompts
|
| 123 |
default_prompt_t2v = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
|
|
|
|
| 145 |
image = image.crop((0, top, width, top + new_height))
|
| 146 |
return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
|
| 147 |
|
| 148 |
+
def get_duration(
    mode,
    input_image,
    prompt,
    negative_prompt,
    duration_seconds,
    guidance_scale,
    guidance_scale_2,
    steps,
    seed,
    randomize_seed,
    progress,
):
    """Return the GPU duration budget for one generation request.

    Passed as the ``duration`` callable of ``@spaces.GPU``. The parameter list
    lines up with the visible parameters of ``generate_video``, but only
    ``steps`` affects the result: the budget is 15 per step (presumably
    seconds, as interpreted by ``spaces.GPU`` - confirm).
    """
    budget_per_step = 15
    step_count = int(steps)
    return budget_per_step * step_count
|
| 162 |
|
| 163 |
@spaces.GPU(duration=get_duration)
|
| 164 |
+
@torch.no_grad()
|
| 165 |
def generate_video(
|
| 166 |
mode,
|
| 167 |
input_image,
|
|
|
|
| 175 |
randomize_seed=False,
|
| 176 |
progress=gr.Progress(track_tqdm=True),
|
| 177 |
):
|
|
|
|
|
|
|
|
|
|
| 178 |
num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
|
| 179 |
current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
|
| 180 |
|
| 181 |
if mode == "Text-to-Video":
|
| 182 |
+
unload_i2v_pipeline() # Unload I2V to free memory
|
| 183 |
+
pipe = load_t2v_pipeline()
|
| 184 |
+
output_frames_list = pipe(
|
| 185 |
prompt=prompt,
|
| 186 |
negative_prompt=negative_prompt,
|
| 187 |
height=LANDSCAPE_HEIGHT,
|
|
|
|
| 193 |
generator=torch.Generator(device="cuda").manual_seed(current_seed),
|
| 194 |
).frames[0]
|
| 195 |
else: # Image-to-Video
|
| 196 |
+
unload_t2v_pipeline() # Unload T2V to free memory
|
| 197 |
+
pipe = load_i2v_pipeline()
|
| 198 |
+
if input_image is None:
|
| 199 |
+
raise gr.Error("Please upload an input image.")
|
| 200 |
resized_image = resize_image(input_image)
|
| 201 |
+
output_frames_list = pipe(
|
| 202 |
image=resized_image,
|
| 203 |
prompt=prompt,
|
| 204 |
negative_prompt=negative_prompt,
|
|
|
|
| 215 |
video_path = tmpfile.name
|
| 216 |
|
| 217 |
export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
|
| 218 |
+
clear_memory() # Clean up after generation
|
| 219 |
return video_path, current_seed
|
| 220 |
|
| 221 |
with gr.Blocks() as demo:
|
|
|
|
| 255 |
|
| 256 |
gr.Examples(
|
| 257 |
examples=[
|
| 258 |
+
["POV selfie video, white cat with sunglasses standing on surfboard, relaxed smile, tropical beach behind (clear water, green hills, blue sky with clouds). Surfboard tips, cat falls into ocean, camera plunges underwater with bubbles and sunlight beams. Brief underwater view of cat’s face, then cat resurfaces, still filming selfie, playful summer vacation mood."],
|
| 259 |
["Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."],
|
| 260 |
["A cinematic shot of a boat sailing on a calm sea at sunset."],
|
| 261 |
["Drone footage flying over a futuristic city with flying cars."],
|