wan2.2-14B-TI2V-ALL

Paused

App Files Files Community

rahul7star committed on Aug 10

Commit

0079199

verified ·

1 Parent(s): f949e2d

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -59

app.py CHANGED Viewed

@@ -1,4 +1,9 @@
 import os
 os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
 import spaces
@@ -13,7 +18,13 @@ import numpy as np
 from PIL import Image
 import random
 import gc
-from optimization import optimize_pipeline_
 # Model configurations
 T2V_MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
@@ -24,62 +35,89 @@ MAX_SEED = np.iinfo(np.int32).max
 FIXED_FPS = 16
 MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 81
-MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
-MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
-# Initialize T2V pipeline
-vae = AutoencoderKLWan.from_pretrained(T2V_MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
-t2v_pipe = WanPipeline.from_pretrained(T2V_MODEL_ID,
-    transformer=WanTransformer3DModel.from_pretrained('linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
-        subfolder='transformer',
-        torch_dtype=torch.bfloat16,
-        device_map='cuda',
-    ),
-    transformer_2=WanTransformer3DModel.from_pretrained('linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
-        subfolder='transformer_2',
-        torch_dtype=torch.bfloat16,
-        device_map='cuda',
-    ),
-    vae=vae,
-    torch_dtype=torch.bfloat16,
-).to('cuda')
-# Initialize I2V pipeline
-i2v_pipe = WanImageToVideoPipeline.from_pretrained(I2V_MODEL_ID,
-    transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-        subfolder='transformer',
-        torch_dtype=torch.bfloat16,
-        device_map='cuda',
-    ),
-    transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-        subfolder='transformer_2',
-        torch_dtype=torch.bfloat16,
-        device_map='cuda',
-    ),
-    torch_dtype=torch.bfloat16,
-).to('cuda')
-# Memory management
-for i in range(3):
-    gc.collect()
-    torch.cuda.synchronize()
-    torch.cuda.empty_cache()
-# Optimize pipelines
-optimize_pipeline_(t2v_pipe,
-    prompt='prompt',
-    height=LANDSCAPE_HEIGHT,
-    width=LANDSCAPE_WIDTH,
-    num_frames=MAX_FRAMES_MODEL,
-)
-optimize_pipeline_(i2v_pipe,
-    image=Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
-    prompt='prompt',
-    height=LANDSCAPE_HEIGHT,
-    width=LANDSCAPE_WIDTH,
-    num_frames=MAX_FRAMES_MODEL,
-)
 # Default prompts
 default_prompt_t2v = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
@@ -107,11 +145,23 @@ def resize_image_landscape(image: Image.Image) -> Image.Image:
         image = image.crop((0, top, width, top + new_height))
     return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
-def get_duration(*args, **kwargs):
-    steps = kwargs.get('steps', args[2] if len(args) > 2 else 4)
     return int(steps) * 15
 @spaces.GPU(duration=get_duration)
 def generate_video(
     mode,
     input_image,
@@ -125,14 +175,13 @@ def generate_video(
     randomize_seed=False,
     progress=gr.Progress(track_tqdm=True),
 ):
-    if mode == "Image-to-Video" and input_image is None:
-        raise gr.Error("Please upload an input image for Image-to-Video mode.")
     num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     if mode == "Text-to-Video":
-        output_frames_list = t2v_pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
             height=LANDSCAPE_HEIGHT,
@@ -144,8 +193,12 @@ def generate_video(
             generator=torch.Generator(device="cuda").manual_seed(current_seed),
         ).frames[0]
     else:  # Image-to-Video
         resized_image = resize_image(input_image)
-        output_frames_list = i2v_pipe(
             image=resized_image,
             prompt=prompt,
             negative_prompt=negative_prompt,
@@ -162,6 +215,7 @@ def generate_video(
         video_path = tmpfile.name
     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
     return video_path, current_seed
 with gr.Blocks() as demo:
@@ -201,6 +255,7 @@ with gr.Blocks() as demo:
             gr.Examples(
                 examples=[
                     ["Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."],
                     ["A cinematic shot of a boat sailing on a calm sea at sunset."],
                     ["Drone footage flying over a futuristic city with flying cars."],

 import os
+# Set environment variables before any imports to suppress inductor warnings
+os.environ["TORCHINDUCTOR_CUDA_GRAPHS"] = "0"
+os.environ["TORCHINDUCTOR_MAX_AUTOTUNE_GEMM"] = "0"
+# Install dependencies as specified
 os.system('pip install --upgrade --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu126 "torch<2.9" spaces')
 import spaces
 from PIL import Image
 import random
 import gc
+# optimize_pipeline_ is imported from the `optimization` module. If that module
+# cannot be imported, fall back to a stub so the rest of the file still runs;
+# in that case the pipelines are used without any optimization step.
+try:
+    from optimization import optimize_pipeline_
+except ImportError:
+    def optimize_pipeline_(pipe, **kwargs):
+        """Fallback stub: accept a pipeline plus keyword arguments and do nothing."""
+        pass  # No-op if optimization is not available
 # Model configurations
 T2V_MODEL_ID = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
 FIXED_FPS = 16
 MIN_FRAMES_MODEL = 8
 MAX_FRAMES_MODEL = 81
+MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS, 1)
+MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS, 1)
+# Cache for pipelines
+t2v_pipe_cache = [None]
+i2v_pipe_cache = [None]
+def clear_memory():
+    """Run three rounds of garbage collection and CUDA cache cleanup.
+
+    Each round calls ``gc.collect()``; when CUDA is available it then empties
+    the CUDA cache and synchronizes.
+    """
+    for _round in range(3):
+        gc.collect()
+        if not torch.cuda.is_available():
+            # No GPU visible: the Python-side collection is all there is to do.
+            continue
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+def load_t2v_pipeline():
+    """Return the text-to-video pipeline, building it on first use.
+
+    The pipeline object is stored in ``t2v_pipe_cache[0]``; while that slot is
+    populated, later calls return it without loading anything again.
+    """
+    if t2v_pipe_cache[0] is None:
+        # The VAE is requested in float32, while both transformers and the
+        # pipeline itself are requested in bfloat16.
+        vae = AutoencoderKLWan.from_pretrained(T2V_MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
+        t2v_pipe_cache[0] = WanPipeline.from_pretrained(T2V_MODEL_ID,
+            transformer=WanTransformer3DModel.from_pretrained('linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
+                subfolder='transformer',
+                torch_dtype=torch.bfloat16,
+                device_map='cuda',
+            ),
+            transformer_2=WanTransformer3DModel.from_pretrained('linoyts/Wan2.2-T2V-A14B-Diffusers-BF16',
+                subfolder='transformer_2',
+                torch_dtype=torch.bfloat16,
+                device_map='cuda',
+            ),
+            vae=vae,
+            torch_dtype=torch.bfloat16,
+        ).to('cuda')
+        # Call the optimization hook with a placeholder prompt at the landscape
+        # resolution and the maximum frame count. What the hook does is not
+        # visible here (it may be the no-op fallback).
+        optimize_pipeline_(t2v_pipe_cache[0],
+            prompt='prompt',
+            height=LANDSCAPE_HEIGHT,
+            width=LANDSCAPE_WIDTH,
+            num_frames=MAX_FRAMES_MODEL,
+        )
+        # NOTE(review): the pipeline was already moved to CUDA above and is now
+        # switched to model CPU offload — confirm this combination is intended.
+        t2v_pipe_cache[0].enable_model_cpu_offload()  # Enable CPU offload for memory optimization
+        clear_memory()
+    return t2v_pipe_cache[0]
+def load_i2v_pipeline():
+    """Return the image-to-video pipeline, building it on first use.
+
+    The pipeline object is stored in ``i2v_pipe_cache[0]``; while that slot is
+    populated, later calls return it without loading anything again.
+    """
+    if i2v_pipe_cache[0] is None:
+        # Both transformers and the pipeline itself are requested in bfloat16.
+        i2v_pipe_cache[0] = WanImageToVideoPipeline.from_pretrained(I2V_MODEL_ID,
+            transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
+                subfolder='transformer',
+                torch_dtype=torch.bfloat16,
+                device_map='cuda',
+            ),
+            transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
+                subfolder='transformer_2',
+                torch_dtype=torch.bfloat16,
+                device_map='cuda',
+            ),
+            torch_dtype=torch.bfloat16,
+        ).to('cuda')
+        # Call the optimization hook with a blank RGB image and a placeholder
+        # prompt at the landscape resolution and the maximum frame count. What
+        # the hook does is not visible here (it may be the no-op fallback).
+        optimize_pipeline_(i2v_pipe_cache[0],
+            image=Image.new('RGB', (LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT)),
+            prompt='prompt',
+            height=LANDSCAPE_HEIGHT,
+            width=LANDSCAPE_WIDTH,
+            num_frames=MAX_FRAMES_MODEL,
+        )
+        # NOTE(review): the pipeline was already moved to CUDA above and is now
+        # switched to model CPU offload — confirm this combination is intended.
+        i2v_pipe_cache[0].enable_model_cpu_offload()  # Enable CPU offload for memory optimization
+        clear_memory()
+    return i2v_pipe_cache[0]
+def unload_t2v_pipeline():
+    """Drop the cached text-to-video pipeline and reclaim its memory.
+
+    Does nothing when no pipeline is currently cached.
+    """
+    if t2v_pipe_cache[0] is None:
+        return
+    t2v_pipe_cache[0].to("cpu")
+    # Overwrite the slot rather than `del t2v_pipe_cache[0]`: deleting the only
+    # element leaves an empty list, so the index assignment that followed (and
+    # every later `t2v_pipe_cache[0]` lookup) raised IndexError.
+    t2v_pipe_cache[0] = None
+    clear_memory()
+def unload_i2v_pipeline():
+    """Drop the cached image-to-video pipeline and reclaim its memory.
+
+    Does nothing when no pipeline is currently cached.
+    """
+    if i2v_pipe_cache[0] is None:
+        return
+    i2v_pipe_cache[0].to("cpu")
+    # Overwrite the slot rather than `del i2v_pipe_cache[0]`: deleting the only
+    # element leaves an empty list, so the index assignment that followed (and
+    # every later `i2v_pipe_cache[0]` lookup) raised IndexError.
+    i2v_pipe_cache[0] = None
+    clear_memory()
 # Default prompts
 default_prompt_t2v = "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."
         image = image.crop((0, top, width, top + new_height))
     return image.resize((LANDSCAPE_WIDTH, LANDSCAPE_HEIGHT), Image.LANCZOS)
+def get_duration(
+    mode,
+    input_image,
+    prompt,
+    negative_prompt,
+    duration_seconds,
+    guidance_scale,
+    guidance_scale_2,
+    steps,
+    seed,
+    randomize_seed,
+    progress,
+):
+    """Return the value used as ``duration`` for ``spaces.GPU``.
+
+    Only ``steps`` is read; the result is 15 per step. The remaining
+    parameters are accepted but unused.
+    """
+    step_count = int(steps)
+    return step_count * 15
 @spaces.GPU(duration=get_duration)
+@torch.no_grad()
 def generate_video(
     mode,
     input_image,
     randomize_seed=False,
     progress=gr.Progress(track_tqdm=True),
 ):
     num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
     if mode == "Text-to-Video":
+        unload_i2v_pipeline()  # Unload I2V to free memory
+        pipe = load_t2v_pipeline()
+        output_frames_list = pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
             height=LANDSCAPE_HEIGHT,
             generator=torch.Generator(device="cuda").manual_seed(current_seed),
         ).frames[0]
     else:  # Image-to-Video
+        unload_t2v_pipeline()  # Unload T2V to free memory
+        pipe = load_i2v_pipeline()
+        if input_image is None:
+            raise gr.Error("Please upload an input image.")
         resized_image = resize_image(input_image)
+        output_frames_list = pipe(
             image=resized_image,
             prompt=prompt,
             negative_prompt=negative_prompt,
         video_path = tmpfile.name
     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
+    clear_memory()  # Clean up after generation
     return video_path, current_seed
 with gr.Blocks() as demo:
             gr.Examples(
                 examples=[
+                    ["POV selfie video, white cat with sunglasses standing on surfboard, relaxed smile, tropical beach behind (clear water, green hills, blue sky with clouds). Surfboard tips, cat falls into ocean, camera plunges underwater with bubbles and sunlight beams. Brief underwater view of cat’s face, then cat resurfaces, still filming selfie, playful summer vacation mood."],
                     ["Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage."],
                     ["A cinematic shot of a boat sailing on a calm sea at sunset."],
                     ["Drone footage flying over a futuristic city with flying cars."],