Wan-2.1-T2V-1.3B-GPU

Paused

App Files Files Community

markury commited on Mar 19

Commit

cf345e6

1 Parent(s): b17912c

test: ui

Browse files

Files changed (2) hide show

app.py +228 -4
requirements.txt +7 -0

app.py CHANGED Viewed

@@ -1,7 +1,231 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+import torch
 import gradio as gr
+import spaces
+from diffusers.utils import export_to_video
+from diffusers import AutoencoderKLWan, WanPipeline
+from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
+from diffusers.schedulers.scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
+# Define model options
+MODEL_OPTIONS = {
+    "Wan2.1-T2V-1.3B": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+    "Wan2.1-T2V-14B": "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+}
+# Define scheduler options
+SCHEDULER_OPTIONS = {
+    "UniPCMultistepScheduler": UniPCMultistepScheduler,
+    "FlowMatchEulerDiscreteScheduler": FlowMatchEulerDiscreteScheduler
+}
+@spaces.GPU(duration=300)  # Set a 5-minute duration for the GPU access
+def generate_video(
+    model_choice,
+    prompt,
+    negative_prompt,
+    lora_id,
+    lora_scale,
+    scheduler_type,
+    flow_shift,
+    height,
+    width,
+    num_frames,
+    guidance_scale,
+    num_inference_steps,
+    output_fps
+):
+    """Generate a video using the Wan model and provided parameters"""
+    try:
+        # Get model ID from selection
+        model_id = MODEL_OPTIONS[model_choice]
+        # Load the model components
+        vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
+        pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
+        # Set the scheduler
+        scheduler_class = SCHEDULER_OPTIONS[scheduler_type]
+        if scheduler_type == "UniPCMultistepScheduler":
+            pipe.scheduler = scheduler_class.from_config(
+                pipe.scheduler.config,
+                prediction_type="flow_prediction",
+                use_flow_sigmas=True,
+                flow_shift=flow_shift
+            )
+        else:
+            pipe.scheduler = scheduler_class(shift=flow_shift)
+        # Move to GPU
+        pipe.to("cuda")
+        # Enable CPU offload for low VRAM
+        pipe.enable_model_cpu_offload()
+        # Load and fuse LoRA if provided
+        if lora_id and lora_id.strip():
+            try:
+                # Load the LoRA weights
+                pipe.load_lora_weights(lora_id)
+                # Fuse LoRA with specified scale if available
+                if hasattr(pipe, "fuse_lora"):
+                    pipe.fuse_lora(lora_scale=lora_scale)
+            except Exception as e:
+                return f"Error loading/fusing LoRA: {str(e)}"
+        # Generate the video
+        output = pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            num_frames=num_frames,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps
+        ).frames[0]
+        # Export to video
+        temp_file = "output.mp4"
+        export_to_video(output, temp_file, fps=output_fps)
+        return temp_file
+    except Exception as e:
+        return f"Error generating video: {str(e)}"
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Wan Video Generation with ZeroGPU")
+    gr.Markdown("Generate high-quality videos using the Wan model with optional LoRA adaptations.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            model_choice = gr.Dropdown(
+                choices=list(MODEL_OPTIONS.keys()),
+                value="Wan2.1-T2V-1.3B",
+                label="Model"
+            )
+            prompt = gr.Textbox(
+                label="Prompt",
+                value="steamboat willie style, golden era animation, an anthropomorphic cat character wearing a hat removes it and performs a courteous bow",
+                lines=3
+            )
+            negative_prompt = gr.Textbox(
+                label="Negative Prompt",
+                value="色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走",
+                lines=3
+            )
+            with gr.Row():
+                lora_id = gr.Textbox(
+                    label="LoRA ID (e.g., benjamin-paine/steamboat-willie-1.3b)",
+                    value="benjamin-paine/steamboat-willie-1.3b"
+                )
+                lora_scale = gr.Slider(
+                    label="LoRA Scale",
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.75,
+                    step=0.05
+                )
+            with gr.Row():
+                scheduler_type = gr.Dropdown(
+                    choices=list(SCHEDULER_OPTIONS.keys()),
+                    value="UniPCMultistepScheduler",
+                    label="Scheduler"
+                )
+                flow_shift = gr.Slider(
+                    label="Flow Shift",
+                    minimum=1.0,
+                    maximum=12.0,
+                    value=3.0,
+                    step=0.5,
+                    info="2.0-5.0 for smaller videos, 7.0-12.0 for larger videos"
+                )
+            with gr.Row():
+                height = gr.Slider(
+                    label="Height",
+                    minimum=256,
+                    maximum=1024,
+                    value=480,
+                    step=32
+                )
+                width = gr.Slider(
+                    label="Width",
+                    minimum=256,
+                    maximum=1792,
+                    value=832,
+                    step=32
+                )
+            with gr.Row():
+                num_frames = gr.Slider(
+                    label="Number of Frames (4k+1 is recommended, e.g. 81)",
+                    minimum=17,
+                    maximum=129,
+                    value=81,
+                    step=4
+                )
+                output_fps = gr.Slider(
+                    label="Output FPS",
+                    minimum=8,
+                    maximum=30,
+                    value=16,
+                    step=1
+                )
+            with gr.Row():
+                guidance_scale = gr.Slider(
+                    label="Guidance Scale (CFG)",
+                    minimum=1.0,
+                    maximum=15.0,
+                    value=5.0,
+                    step=0.5
+                )
+                num_inference_steps = gr.Slider(
+                    label="Inference Steps",
+                    minimum=10,
+                    maximum=100,
+                    value=32,
+                    step=1
+                )
+            generate_btn = gr.Button("Generate Video")
+        with gr.Column(scale=1):
+            output_video = gr.Video(label="Generated Video")
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            model_choice,
+            prompt,
+            negative_prompt,
+            lora_id,
+            lora_scale,
+            scheduler_type,
+            flow_shift,
+            height,
+            width,
+            num_frames,
+            guidance_scale,
+            num_inference_steps,
+            output_fps
+        ],
+        outputs=output_video
+    )
+    gr.Markdown("""
+    ## Tips for best results:
+    - For smaller resolution videos, try lower values of flow shift (2.0-5.0)
+    - For larger resolution videos, try higher values of flow shift (7.0-12.0)
+    - Number of frames should be of the form 4k+1 (e.g., 49, 81, 65)
+    - The model is memory intensive, so adjust resolution according to available VRAM
+    - LoRA ID should be a Hugging Face repository containing safetensors files
+    """)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+git+https://github.com/huggingface/diffusers.git
+transformers
+accelerate
+safetensors
+torch>=2.0.1
+gradio
+spaces