import torch
import gradio as gr
import spaces
from diffusers.utils import export_to_video
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler

# Define model options
MODEL_OPTIONS = {
    "Wan2.1-T2V-1.3B": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
    "Wan2.1-T2V-14B": "Wan-AI/Wan2.1-T2V-14B-Diffusers"
}

# Define scheduler options
SCHEDULER_OPTIONS = {
    "UniPCMultistepScheduler": UniPCMultistepScheduler,
    "FlowMatchEulerDiscreteScheduler": FlowMatchEulerDiscreteScheduler
}

@spaces.GPU(duration=300)  # Allow up to 5 minutes of GPU time per call
def generate_video(
    model_choice,
    prompt,
    negative_prompt,
    lora_id,
    lora_scale,
    scheduler_type,
    flow_shift,
    height,
    width,
    num_frames,
    guidance_scale,
    num_inference_steps,
    output_fps
):
    """Generate a video using the Wan model and the provided parameters."""
    try:
        # Resolve the Hugging Face model ID from the dropdown selection
        model_id = MODEL_OPTIONS[model_choice]

        # Load the model components (the VAE is kept in float32 for numerical stability)
        vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
        pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)

        # Set the scheduler
        scheduler_class = SCHEDULER_OPTIONS[scheduler_type]
        if scheduler_type == "UniPCMultistepScheduler":
            pipe.scheduler = scheduler_class.from_config(
                pipe.scheduler.config,
                prediction_type="flow_prediction",
                use_flow_sigmas=True,
                flow_shift=flow_shift
            )
        else:
            pipe.scheduler = scheduler_class(shift=flow_shift)

        # Move to GPU
        pipe.to("cuda")

        # Enable CPU offload for low VRAM
        pipe.enable_model_cpu_offload()

        # Load and fuse LoRA weights if a repository ID was provided
        if lora_id and lora_id.strip():
            try:
                # Load the LoRA weights
                pipe.load_lora_weights(lora_id)

                # Fuse LoRA at the requested scale if the pipeline supports it
                if hasattr(pipe, "fuse_lora"):
                    pipe.fuse_lora(lora_scale=lora_scale)
            except Exception as e:
                raise gr.Error(f"Error loading/fusing LoRA: {str(e)}")

        # Generate the video frames
        output = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_frames=num_frames,
            guidance_scale=guidance_scale,
            num_inference_steps=num_inference_steps
        ).frames[0]

        # Export the frames to an MP4 file
        temp_file = "output.mp4"
        export_to_video(output, temp_file, fps=output_fps)

        return temp_file
    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Error generating video: {str(e)}")

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Wan Video Generation with ZeroGPU")
    gr.Markdown("Generate high-quality videos using the Wan model with optional LoRA adaptations.")

    with gr.Row():
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Wan2.1-T2V-1.3B",
                label="Model"
            )

            prompt = gr.Textbox(
                label="Prompt",
                value="steamboat willie style, golden era animation, an anthropomorphic cat character wearing a hat removes it and performs a courteous bow",
                lines=3
            )

            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                # Default Wan2.1 negative prompt (Chinese): vivid tones, overexposure, static,
                # blurry details, subtitles, worst/low quality, JPEG artifacts, deformed limbs,
                # extra or fused fingers, poorly drawn hands/faces, cluttered background, etc.
                value="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
                lines=3
            )

            with gr.Row():
                lora_id = gr.Textbox(
                    label="LoRA ID (e.g., benjamin-paine/steamboat-willie-1.3b)",
                    value="benjamin-paine/steamboat-willie-1.3b"
                )
                lora_scale = gr.Slider(
                    label="LoRA Scale",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.75,
                    step=0.05
                )

            with gr.Row():
                scheduler_type = gr.Dropdown(
                    choices=list(SCHEDULER_OPTIONS.keys()),
                    value="UniPCMultistepScheduler",
                    label="Scheduler"
                )
                flow_shift = gr.Slider(
                    label="Flow Shift",
                    minimum=1.0,
                    maximum=12.0,
                    value=3.0,
                    step=0.5,
                    info="2.0-5.0 for smaller videos, 7.0-12.0 for larger videos"
                )

            with gr.Row():
                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=1024,
                    value=480,
                    step=32
                )
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=1792,
                    value=832,
                    step=32
                )

            with gr.Row():
                num_frames = gr.Slider(
                    label="Number of Frames (4k+1 is recommended, e.g. 81)",
                    minimum=17,
                    maximum=129,
                    value=81,
                    step=4
                )
                output_fps = gr.Slider(
                    label="Output FPS",
                    minimum=8,
                    maximum=30,
                    value=16,
                    step=1
                )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance Scale (CFG)",
                    minimum=1.0,
                    maximum=15.0,
                    value=5.0,
                    step=0.5
                )
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=100,
                    value=32,
                    step=1
                )

            generate_btn = gr.Button("Generate Video")

        with gr.Column(scale=1):
            output_video = gr.Video(label="Generated Video")

    generate_btn.click(
        fn=generate_video,
        inputs=[
            model_choice,
            prompt,
            negative_prompt,
            lora_id,
            lora_scale,
            scheduler_type,
            flow_shift,
            height,
            width,
            num_frames,
            guidance_scale,
            num_inference_steps,
            output_fps
        ],
        outputs=output_video
    )

    gr.Markdown("""
    ## Tips for best results:
    - For smaller resolution videos, try lower flow shift values (2.0-5.0)
    - For larger resolution videos, try higher flow shift values (7.0-12.0)
    - The number of frames should be of the form 4k+1 (e.g., 49, 65, 81)
    - The model is memory intensive, so adjust the resolution to the available VRAM
    - The LoRA ID should be a Hugging Face repository containing safetensors files
    """)

demo.launch()