Spaces:

ali-vilab
/

modelscope-text-to-video-synthesis

Runtime error

File size: 5,382 Bytes

#!/usr/bin/env python

from __future__ import annotations

import os
import random
import tempfile
import sys

# Check critical dependencies before proceeding
try:
    import numpy as np
    import torch
    import gradio as gr
    import imageio
    from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
except ImportError as e:
    print(f"Error: Missing required dependency - {e}")
    print("Please ensure requirements.txt includes: numpy, torch, diffusers, gradio, imageio")
    sys.exit(1)

# Markdown/HTML header rendered at the top of the demo page.
DESCRIPTION = (
    '# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)\n'
    '<p>For Colab usage, you can view <a href="https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing" style="text-decoration: underline;" target="_blank">this webpage</a>.</p>\n'
    '<p>This model can only be used for non-commercial purposes. See the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
)

# When the SPACE_ID environment variable is set, append a "duplicate this
# space" badge that links back to that space.
SPACE_ID = os.getenv('SPACE_ID')
if SPACE_ID is not None:
    DESCRIPTION += (
        '\n<p>For faster inference, you may duplicate this space and upgrade to GPU. \n'
        f'    <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true">\n'
        '    <img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a></p>'
    )

# Upper bound for the frame-count slider; overridable through the
# MAX_NUM_FRAMES environment variable (default reduced from 200 for stability).
MAX_NUM_FRAMES = int(os.environ.get('MAX_NUM_FRAMES', '64'))
# Initial slider value: 16, unless the configured maximum is smaller.
DEFAULT_NUM_FRAMES = MAX_NUM_FRAMES if MAX_NUM_FRAMES < 16 else 16

# Load the text-to-video pipeline once at import time. Any failure while
# loading or configuring it is reported on stdout and aborts the process
# with exit status 1.
try:
    pipe = DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b', variant='fp16', torch_dtype=torch.float16)
    # Swap in a DPM-Solver multistep scheduler built from the pipeline's
    # existing scheduler config.
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.enable_model_cpu_offload()
    pipe.enable_vae_slicing()
except Exception as exc:
    print(
        f"Failed to initialize pipeline: {exc}",
        "This model requires significant GPU memory. Try a smaller model like 'cerspense/zeroscope_v2_576w' if needed.",
        sep='\n',
    )
    raise SystemExit(1)

def to_video(frames: list[np.ndarray], fps: int) -> str:
    """Encode *frames* into a temporary MP4 file and return its path.

    Args:
        frames: Frames to append to the video, in playback order.
        fps: Frame rate passed to the imageio FFMPEG writer.

    Returns:
        Path of the written ``.mp4`` file. The file is not deleted
        automatically; the caller owns it.

    Raises:
        Exception: Whatever the writer raises is logged to stdout and
            re-raised; the partially written file is removed first.
    """
    # Only a unique path is needed: close the handle right away so no file
    # descriptor is leaked while the writer works on the same path.
    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as out_file:
        out_path = out_file.name

    try:
        writer = imageio.get_writer(out_path, format='FFMPEG', fps=fps)
        try:
            for frame in frames:
                writer.append_data(frame)
        finally:
            # Always release the writer, even when a frame fails to encode.
            writer.close()
    except Exception as e:
        print(f"Video creation failed: {e}")
        # Best-effort cleanup: nobody receives the path on failure, so the
        # partial file would otherwise be orphaned in the temp directory.
        try:
            os.remove(out_path)
        except OSError:
            pass
        raise
    return out_path

def generate(prompt: str, seed: int, num_frames: int, num_inference_steps: int) -> str:
    """Generate a video from a text prompt and return the video file path.

    Args:
        prompt: Text prompt; must contain at least one non-whitespace
            character.
        seed: Seed for the torch generator. ``-1`` selects a random seed in
            ``[0, 1000000]``.
        num_frames: Number of frames requested from the pipeline.
        num_inference_steps: Number of inference steps passed to the pipeline.

    Returns:
        Path of the video file produced by ``to_video`` (encoded at 8 fps).

    Raises:
        gr.Error: If the prompt is missing/blank, the GPU runs out of memory,
            or generation/encoding fails for any other reason. The original
            exception is chained as ``__cause__``.
    """
    # Guard against a missing value as well as a blank one, so the user gets
    # the friendly message instead of an AttributeError.
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a valid prompt")

    # NOTE(review): UI numeric inputs may arrive as floats; coerce so that
    # manual_seed and the pipeline receive plain ints - confirm against the
    # installed Gradio version.
    seed = int(seed)
    if seed == -1:
        seed = random.randint(0, 1000000)
    generator = torch.Generator().manual_seed(seed)

    try:
        frames = pipe(
            prompt,
            num_inference_steps=int(num_inference_steps),
            num_frames=int(num_frames),
            generator=generator,
        ).frames
        return to_video(frames, 8)
    except torch.cuda.OutOfMemoryError as e:
        raise gr.Error("Out of GPU memory - Try reducing frame count or use a smaller model") from e
    except Exception as e:
        raise gr.Error(f"Generation failed: {str(e)}") from e

# Example rows for the demo, each laid out as
# [prompt, seed, num_frames, num_inference_steps]; every example shares
# seed 0, 16 frames and 25 inference steps.
examples = [
    [example_prompt, 0, 16, 25]
    for example_prompt in (
        'An astronaut riding a horse.',
        'A panda eating bamboo on a rock.',
        'Spiderman is surfing.',
    )
]

# Build the Gradio UI: description header, prompt row with a run button,
# result video, an "Advanced options" accordion with three sliders, the
# example gallery, and the event wiring that calls `generate`.
with gr.Blocks(css='style.css') as demo:
    gr.Markdown(DESCRIPTION)
    
    with gr.Group():
        # NOTE(review): `gr.Box` and the `.style(...)` method are legacy
        # Gradio APIs that newer major versions no longer provide - confirm
        # the pinned gradio version still supports them.
        with gr.Box():
            with gr.Row(elem_id='prompt-container').style(equal_height=True):
                prompt = gr.Text(
                    label='Prompt',
                    show_label=False,
                    max_lines=1,
                    placeholder='Enter your prompt',
                    elem_id='prompt-text-input'
                )
                run_button = gr.Button('Generate video')
        
        result = gr.Video(label='Result', show_label=False)
        
        with gr.Accordion('Advanced options', open=False):
            # -1 is the sentinel that `generate` replaces with a random seed.
            seed = gr.Slider(
                label='Seed',
                minimum=-1,
                maximum=1000000,
                step=1,
                value=-1,
                info='-1 = random seed each time'
            )
            # NOTE(review): minimum is hard-coded to 16 while maximum comes
            # from MAX_NUM_FRAMES; if that is configured below 16 the range
            # is inverted - confirm the intended lower bound.
            num_frames = gr.Slider(
                label='Number of frames',
                minimum=16,
                maximum=MAX_NUM_FRAMES,
                step=1,
                value=DEFAULT_NUM_FRAMES,
                info='Higher values require more GPU memory'
            )
            num_inference_steps = gr.Slider(
                label='Inference steps',
                minimum=10,
                maximum=50,
                step=1,
                value=25
            )

    # Order must match the positional parameters of `generate`:
    # (prompt, seed, num_frames, num_inference_steps).
    inputs = [prompt, seed, num_frames, num_inference_steps]
    
    # Example outputs are cached only when the SYSTEM environment variable
    # equals 'spaces'.
    gr.Examples(
        examples=examples,
        inputs=inputs,
        outputs=result,
        fn=generate,
        cache_examples=os.getenv('SYSTEM') == 'spaces'
    )

    # Both pressing Enter in the prompt box and clicking the button trigger
    # the same generation call.
    prompt.submit(fn=generate, inputs=inputs, outputs=result)
    run_button.click(fn=generate, inputs=inputs, outputs=result)
    
    # Additional UI sections remain unchanged...
    
# Enable the request queue (max_size=10) and start the app.
demo.queue(max_size=10).launch()