import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video

# Load the base pipeline and the latent upsampler, sharing a single VAE.
pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
pipe.vae.enable_tiling()  # tile VAE decoding to reduce peak memory

prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121
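
# LTX's VAE compresses 32x spatially, so the generation height and width
# should be multiples of 32 (704 * 2/3 = 469 is not). A small helper, added
# here as an assumption about the pipeline's input constraints:
def round_down_to_multiple_of_32(height, width):
    return height - (height % 32), width - (width % 32)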

# Part 1. Generate the video at a lower resolution.
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
downscaled_height, downscaled_width = round_down_to_multiple_of_32(downscaled_height, downscaled_width)
latents = pipe(
    conditions=None,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=downscaled_width,
    height=downscaled_height,
    num_frames=num_frames,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
    output_type="latent",
).frames

# Part 2. Upscale the latents 2x with the latent upsampler; no prompt needed.
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
    latents=latents,
    output_type="latent",
).frames

# Part 3. Denoise the upscaled latents for a few steps to restore fine texture.
# With denoise_strength=0.4 and num_inference_steps=10, only the last 4 steps run.
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=upscaled_width,
    height=upscaled_height,
    num_frames=num_frames,
    denoise_strength=0.4,
    num_inference_steps=10,
    latents=upscaled_latents,
    decode_timestep=0.05,
    image_cond_noise_scale=0.025,
    generator=torch.Generator().manual_seed(0),
    output_type="pil",
).frames[0]

# Part 4. Resize the frames to the requested output resolution and export.
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
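
# If both pipelines don't fit on the GPU together, model CPU offload is a
# common diffusers fallback (a suggestion, not part of the original snippet);
# it replaces the .to("cuda") calls above:
#
#   pipe.enable_model_cpu_offload()
#   pipe_upsample.enable_model_cpu_offload()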


# ---------------------------------------------------------------------------
# Gradio demo: the same two-stage pipeline behind a simple web UI.
# ---------------------------------------------------------------------------
import torch
import gradio as gr
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video


def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress(),
):
    progress(0.1, desc="Loading models...")
    # Loading the pipelines on every call keeps the demo simple but is slow;
    # see the note after this function for a faster alternative.
    pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
    pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
    pipe.to("cuda")
    pipe_upsample.to("cuda")
    pipe.vae.enable_tiling()

    progress(0.2, desc="Generating initial video...")
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    # Keep dimensions divisible by 32 for the LTX VAE.
    downscaled_height -= downscaled_height % 32
    downscaled_width -= downscaled_width % 32
    generator = torch.Generator().manual_seed(seed)

    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",
    ).frames

    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent",
    ).frames

    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=generator,
        output_type="pil",
    ).frames[0]

    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)

    return output_path
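

# Note (a suggestion, not in the original app): generate_video reloads both
# pipelines on every click. For a long-lived app, load them once at module
# scope and reference them inside the function instead, e.g.:
#
#   pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
#   pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
#
# and drop the from_pretrained calls from generate_video.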


with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX-Video model")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4,
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2,
            )

            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704,
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512,
                )

            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2 / 3,
                )
                # LTX expects frame counts of the form 8k + 1 (e.g. 121), so
                # the slider steps through valid values only.
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=25,
                    maximum=241,
                    step=8,
                    value=121,
                )

            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30,
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4,
                )
                seed = gr.Number(
                    label="Seed",
                    value=0,
                    precision=0,
                )

            submit_btn = gr.Button("Generate Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")

    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed,
        ],
        outputs=output_video,
    )

if __name__ == "__main__":
    demo.launch()
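
# Generation takes minutes per request, so Gradio's request queue is worth
# enabling (a suggestion, assuming default queue settings are acceptable):
#
#   if __name__ == "__main__":
#       demo.queue().launch()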