Wan-2.1-T2V-1.3B-GPU

Paused

App Files Files Community

markury committed on Mar 19

Commit

10b0bca

1 Parent(s): 08b4ec0

fix(wip): second pass

Browse files

Files changed (1) hide show

app.py +61 -38

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepSchedu
 from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
 import os
 import tempfile
 # Define model options
 MODEL_OPTIONS = {
@@ -43,7 +44,7 @@ def generate_video(
     second_pass_flow_shift,
     second_pass_cfg,
     show_both_outputs
-):
     # Get model ID from selection
     model_id = MODEL_OPTIONS[model_choice]
@@ -98,23 +99,16 @@ def generate_video(
         num_frames=num_frames,
         guidance_scale=guidance_scale,
         num_inference_steps=num_inference_steps,
-        output_type="latent" if enable_second_pass else "pt",  # Only return latents if doing second pass
         return_dict=True
     )
-    # Get the latents from the first pass output
-    latents = first_pass.frames[0]
-    # If we're not doing a second pass or need to display both outputs, decode the first pass
     if not enable_second_pass or (enable_second_pass and show_both_outputs):
-        # Decode the latents to frames with the VAE (only needed if we requested latents)
-        if enable_second_pass:
-            print("Decoding first pass latents...")
-            with torch.no_grad():
-                first_pass_frames = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample
-        else:
-            first_pass_frames = latents
         # Export first pass to video
         first_pass_file = "output_first_pass.mp4"
         export_to_video(first_pass_frames, first_pass_file, fps=output_fps)
@@ -125,6 +119,14 @@ def generate_video(
     if enable_second_pass:
         print("Running second pass with scale factor:", second_pass_scale)
         # Resize latents for second pass (upscale)
         new_height = int(height * second_pass_scale)
         new_width = int(width * second_pass_scale)
@@ -135,10 +137,18 @@ def generate_video(
         print(f"Upscaling latents from {height}x{width} to {new_height}x{new_width}")
         # Upscale latents using interpolate
         upscaled_latents = torch.nn.functional.interpolate(
             latents,
-            size=(num_frames, new_height // 8, new_width // 8),  # VAE downsamples by factor of 8
             mode="trilinear",
             align_corners=False
         )
@@ -183,15 +193,18 @@ def generate_video(
         output_files.append(second_pass_file)
     # Return the appropriate video output(s)
-    if enable_second_pass and not show_both_outputs:
-        return second_pass_file
-    elif enable_second_pass and show_both_outputs:
-        return [first_pass_file, second_pass_file]
     else:
-        return first_pass_file
-# Create the Gradio interface
 with gr.Blocks() as demo:
     gr.HTML("""
     <p align="center">
     <svg version="1.1" viewBox="0 0 1200 295" xmlns="http://www.w3.org/2000/svg" xmlns:v="https://vecta.io/nano" width="400">
@@ -364,34 +377,33 @@ with gr.Blocks() as demo:
                 output_video = gr.Video(label="Generated Video")
                 second_output_video = gr.Video(label="Second Pass Video", visible=False)
-                # Show/hide second video based on checkbox
-                def update_second_video_visibility(enable_pass, show_both):
-                    return {"visible": enable_pass and show_both}
                 enable_second_pass.change(
-                    fn=update_second_video_visibility,
                     inputs=[enable_second_pass, show_both_outputs],
                     outputs=[second_output_video]
                 )
                 show_both_outputs.change(
-                    fn=update_second_video_visibility,
                     inputs=[enable_second_pass, show_both_outputs],
                     outputs=[second_output_video]
                 )
-    # Updated function to handle the second pass and multiple outputs
-    def process_generation(*args):
-        result = generate_video(*args)
-        if isinstance(result, list) and len(result) > 1:
-            return [result[0], result[1], {"visible": True}]
-        elif isinstance(result, list) and len(result) == 1:
-            return [result[0], None, {"visible": False}]
         else:
-            return [result, None, {"visible": False}]
     generate_btn.click(
-        fn=process_generation,
         inputs=[
             model_choice,
             prompt,
@@ -416,12 +428,23 @@ with gr.Blocks() as demo:
             show_both_outputs
         ],
         outputs=[
-            output_video,
-            second_output_video,
-            second_output_video  # Update visibility
         ]
     )
     gr.Markdown("""
     ## Tips for best results:
     - For smaller resolution videos, try lower values of flow shift (2.0-5.0)

 from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
 import os
 import tempfile
+from typing import List, Union, Optional
 # Define model options
 MODEL_OPTIONS = {
     second_pass_flow_shift,
     second_pass_cfg,
     show_both_outputs
+) -> Union[str, List[str]]:
     # Get model ID from selection
     model_id = MODEL_OPTIONS[model_choice]
         num_frames=num_frames,
         guidance_scale=guidance_scale,
         num_inference_steps=num_inference_steps,
+        # For Wan, we may need to approach this differently for the latents
+        output_type="pt",  # Always get PyTorch tensors for the first pass
         return_dict=True
     )
+    # Get the frames or latents from the first pass output
+    first_pass_frames = first_pass.frames[0]
+    # Output the first pass video if needed
     if not enable_second_pass or (enable_second_pass and show_both_outputs):
         # Export first pass to video
         first_pass_file = "output_first_pass.mp4"
         export_to_video(first_pass_frames, first_pass_file, fps=output_fps)
     if enable_second_pass:
         print("Running second pass with scale factor:", second_pass_scale)
+        # For second pass, we need to first encode the frames to get latents
+        print("Encoding first pass frames to latents...")
+        with torch.no_grad():
+            # Move frames to the same device as the VAE
+            first_pass_frames = first_pass_frames.to(pipe.vae.device)
+            # Encode to get latents
+            latents = pipe.vae.encode(first_pass_frames).latent_dist.sample()
         # Resize latents for second pass (upscale)
         new_height = int(height * second_pass_scale)
         new_width = int(width * second_pass_scale)
         print(f"Upscaling latents from {height}x{width} to {new_height}x{new_width}")
+        # Get latent dimensions
+        latent_height = latents.shape[2]  # Should be height//8
+        latent_width = latents.shape[3]   # Should be width//8
+        # Calculate new latent dimensions
+        new_latent_height = new_height // 8
+        new_latent_width = new_width // 8
         # Upscale latents using interpolate
         upscaled_latents = torch.nn.functional.interpolate(
             latents,
+            size=(num_frames, new_latent_height, new_latent_width),
             mode="trilinear",
             align_corners=False
         )
         output_files.append(second_pass_file)
     # Return the appropriate video output(s)
+    if enable_second_pass and show_both_outputs and len(output_files) > 1:
+        return output_files
+    elif len(output_files) > 0:
+        return output_files[-1]  # Return the last generated output (either first or second pass)
     else:
+        return "No video was generated. Please check the logs for errors."
+    # Create the Gradio interface
 with gr.Blocks() as demo:
+    # Import gr.update for visibility control
+    from gradio import update
     gr.HTML("""
     <p align="center">
     <svg version="1.1" viewBox="0 0 1200 295" xmlns="http://www.w3.org/2000/svg" xmlns:v="https://vecta.io/nano" width="400">
                 output_video = gr.Video(label="Generated Video")
                 second_output_video = gr.Video(label="Second Pass Video", visible=False)
+                # Control visibility through the UI changes directly
+                def toggle_second_video(enable_pass, show_both):
+                    return gr.update(visible=enable_pass and show_both)
+                # Update visibility when checkboxes change
                 enable_second_pass.change(
+                    fn=toggle_second_video,
                     inputs=[enable_second_pass, show_both_outputs],
                     outputs=[second_output_video]
                 )
                 show_both_outputs.change(
+                    fn=toggle_second_video,
                     inputs=[enable_second_pass, show_both_outputs],
                     outputs=[second_output_video]
                 )
+    # Define a visibility update function separately
+    def update_second_video_visibility(enable_pass, show_both):
+        if enable_pass and show_both:
+            return gr.update(visible=True)
         else:
+            return gr.update(visible=False)
+    # Process generation without trying to update visibility in the same function
     generate_btn.click(
+        fn=generate_video,
         inputs=[
             model_choice,
             prompt,
             show_both_outputs
         ],
         outputs=[
+            output_video if not show_both_outputs else [output_video, second_output_video]
         ]
     )
+    # Update visibility when options change
+    enable_second_pass.change(
+        fn=update_second_video_visibility,
+        inputs=[enable_second_pass, show_both_outputs],
+        outputs=[second_output_video]
+    )
+    show_both_outputs.change(
+        fn=update_second_video_visibility,
+        inputs=[enable_second_pass, show_both_outputs],
+        outputs=[second_output_video]
+    )
     gr.Markdown("""
     ## Tips for best results:
     - For smaller resolution videos, try lower values of flow shift (2.0-5.0)