import torch
import gradio as gr

from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video
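
# Load both pipelines once at import time so every request reuses them;
# reloading the weights per request (as the earlier in-function version of
# this file did) adds minutes of latency to each generation.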
pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
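# If VRAM is tight, a lighter (but slower) alternative to the .to("cuda")
# calls above is pipe.enable_model_cpu_offload(), which diffusers provides
# to move submodules onto the GPU only while they are running.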
pipe.vae.enable_tiling()
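
# The LTX VAE compresses height and width by a fixed spatial factor (32 for
# these checkpoints), so the working resolution must be a multiple of it:
# 704 * 2/3 = 469 would not encode cleanly, so dimensions are snapped down to
# the nearest valid size. This mirrors the helper from the diffusers LTX docs;
# if your diffusers version lacks `vae_spatial_compression_ratio`, substitute
# the literal 32.
def round_to_nearest_resolution_acceptable_by_vae(height, width):
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width
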
prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121
# Part 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
latents = pipe(
conditions=None,
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
num_inference_steps=30,
generator=torch.Generator().manual_seed(0),
output_type="latent",
).frames
# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
latents=latents,
output_type="latent"
).frames
# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=upscaled_width,
height=upscaled_height,
num_frames=num_frames,
denoise_strength=0.4, # Effectively, 4 inference steps out of 10
num_inference_steps=10,
latents=upscaled_latents,
decode_timestep=0.05,
image_cond_noise_scale=0.025,
generator=torch.Generator().manual_seed(0),
output_type="pil",
).frames[0]
# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress(),
):
    # The pipelines and VAE tiling are set up once at module level (see above);
    # loading them inside this function would dominate every request's runtime.
    # Part 1. Generate the video at a smaller resolution
    progress(0.2, desc="Generating initial video...")
    # Gradio sliders deliver floats; the pipelines expect ints
    expected_height, expected_width, num_frames, seed = int(expected_height), int(expected_width), int(num_frames), int(seed)
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
    generator = torch.Generator().manual_seed(seed)
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",  # stay in latent space so the upsampler can consume the result directly
    ).frames
    # Part 2. Upscale the generated video; the available latent upsampler doubles height/width
    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent",
    ).frames
    # Part 3. Denoise the upscaled video with a few steps to improve texture (optional, but recommended)
    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,  # e.g. 0.4 effectively applies 4 of the 10 inference steps
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=torch.Generator().manual_seed(seed),  # fresh generator so this pass is reproducible
        output_type="pil",
    ).frames[0]
    # Part 4. Downscale the video to the expected resolution
    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    # Save and return the video
    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)
    return output_path
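
# A direct call that bypasses the UI, for reference (hypothetical values;
# the frame count is kept of the form 8k + 1 to match the slider below):
#
#   path = generate_video(
#       prompt="A snowy mountain pass at dawn",
#       negative_prompt="worst quality, blurry",
#       expected_height=512, expected_width=704,
#       downscale_factor=2 / 3, num_frames=97,
#       num_inference_steps=30, denoise_strength=0.4, seed=0,
#   )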
# Create the Gradio interface
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4,
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2,
            )
            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704,
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512,
                )
            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2 / 3,
                )
                # LTX generates latent frames in blocks of 8, so frame counts of
                # the form 8k + 1 (e.g. 97, 121, 161) decode cleanly; the slider
                # is stepped accordingly.
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=25,
                    maximum=241,
                    step=8,
                    value=121,
                )
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30,
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4,
                )

            seed = gr.Number(
                label="Seed",
                value=0,
                precision=0,
            )
            submit_btn = gr.Button("Generate Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")
    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed,
        ],
        outputs=output_video,
    )
if __name__ == "__main__":
    demo.queue()  # queue long-running GPU jobs so concurrent requests don't overlap
    demo.launch()