import torch
import gradio as gr

from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video
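
# Load both pipelines once at import time so every request reuses them;
# reloading the weights per request (as the earlier in-function version of
# this file did) adds minutes of latency to each generation.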
pipe = LTXConditionPipeline.from_pretrained("Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16)
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
pipe_upsample.to("cuda")
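# If VRAM is tight, a lighter (but slower) alternative to the .to("cuda")
# calls above is pipe.enable_model_cpu_offload(), which diffusers provides
# to move submodules onto the GPU only while they are running.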
pipe.vae.enable_tiling()
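
# The LTX VAE compresses height and width by a fixed spatial factor (32 for
# these checkpoints), so the working resolution must be a multiple of it:
# 704 * 2/3 = 469 would not encode cleanly, so dimensions are snapped down to
# the nearest valid size. This mirrors the helper from the diffusers LTX docs;
# if your diffusers version lacks `vae_spatial_compression_ratio`, substitute
# the literal 32.
def round_to_nearest_resolution_acceptable_by_vae(height, width):
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width
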
prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
expected_height, expected_width = 704, 512
downscale_factor = 2 / 3
num_frames = 121
# Part 1. Generate video at smaller resolution
downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
latents = pipe(
conditions=None,
prompt=prompt,
negative_prompt=negative_prompt,
width=downscaled_width,
height=downscaled_height,
num_frames=num_frames,
num_inference_steps=30,
generator=torch.Generator().manual_seed(0),
output_type="latent",
).frames
# Part 2. Upscale generated video using latent upsampler with fewer inference steps
# The available latent upsampler upscales the height/width by 2x
upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
upscaled_latents = pipe_upsample(
latents=latents,
output_type="latent"
).frames
# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=upscaled_width,
height=upscaled_height,
num_frames=num_frames,
denoise_strength=0.4, # Effectively, 4 inference steps out of 10
num_inference_steps=10,
latents=upscaled_latents,
decode_timestep=0.05,
image_cond_noise_scale=0.025,
generator=torch.Generator().manual_seed(0),
output_type="pil",
).frames[0]
# Part 4. Downscale the video to the expected resolution
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)
def generate_video(
    prompt,
    negative_prompt,
    expected_height,
    expected_width,
    downscale_factor,
    num_frames,
    num_inference_steps,
    denoise_strength,
    seed,
    progress=gr.Progress(),
):
    # The pipelines and VAE tiling are set up once at module level (see above);
    # loading them inside this function would dominate every request's runtime.
    # Part 1. Generate the video at a smaller resolution
    progress(0.2, desc="Generating initial video...")
    # Gradio sliders deliver floats; the pipelines expect ints
    expected_height, expected_width, num_frames, seed = int(expected_height), int(expected_width), int(num_frames), int(seed)
    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
    generator = torch.Generator().manual_seed(seed)
    latents = pipe(
        conditions=None,
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=downscaled_width,
        height=downscaled_height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        generator=generator,
        output_type="latent",  # stay in latent space so the upsampler can consume the result directly
    ).frames
    # Part 2. Upscale the generated video; the available latent upsampler doubles height/width
    progress(0.5, desc="Upscaling video...")
    upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
    upscaled_latents = pipe_upsample(
        latents=latents,
        output_type="latent",
    ).frames
    # Part 3. Denoise the upscaled video with a few steps to improve texture (optional, but recommended)
    progress(0.7, desc="Refining video quality...")
    video = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=upscaled_width,
        height=upscaled_height,
        num_frames=num_frames,
        denoise_strength=denoise_strength,  # e.g. 0.4 effectively applies 4 of the 10 inference steps
        num_inference_steps=10,
        latents=upscaled_latents,
        decode_timestep=0.05,
        image_cond_noise_scale=0.025,
        generator=torch.Generator().manual_seed(seed),  # fresh generator so this pass is reproducible
        output_type="pil",
    ).frames[0]
    # Part 4. Downscale the video to the expected resolution
    progress(0.9, desc="Finalizing video...")
    video = [frame.resize((expected_width, expected_height)) for frame in video]

    # Save and return the video
    output_path = "output.mp4"
    export_to_video(video, output_path, fps=24)
    return output_path
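
# A direct call that bypasses the UI, for reference (hypothetical values;
# the frame count is kept of the form 8k + 1 to match the slider below):
#
#   path = generate_video(
#       prompt="A snowy mountain pass at dawn",
#       negative_prompt="worst quality, blurry",
#       expected_height=512, expected_width=704,
#       downscale_factor=2 / 3, num_frames=97,
#       num_inference_steps=30, denoise_strength=0.4, seed=0,
#   )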
# Create the Gradio interface
with gr.Blocks(title="LTX Video Generator") as demo:
    gr.Markdown("# LTX Video Generator")
    gr.Markdown("Generate videos from text prompts using Lightricks' LTX model")

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(
                label="Prompt",
                value="The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region.",
                lines=4,
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="worst quality, inconsistent motion, blurry, jittery, distorted",
                lines=2,
            )
            with gr.Row():
                expected_height = gr.Slider(
                    label="Output Height",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=704,
                )
                expected_width = gr.Slider(
                    label="Output Width",
                    minimum=256,
                    maximum=1024,
                    step=64,
                    value=512,
                )
            with gr.Row():
                downscale_factor = gr.Slider(
                    label="Initial Downscale Factor",
                    minimum=0.3,
                    maximum=0.9,
                    step=0.05,
                    value=2 / 3,
                )
                # LTX generates latent frames in blocks of 8, so frame counts of
                # the form 8k + 1 (e.g. 97, 121, 161) decode cleanly; the slider
                # is stepped accordingly.
                num_frames = gr.Slider(
                    label="Number of Frames",
                    minimum=25,
                    maximum=241,
                    step=8,
                    value=121,
                )
            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    step=1,
                    value=30,
                )
                denoise_strength = gr.Slider(
                    label="Denoise Strength",
                    minimum=0.1,
                    maximum=0.9,
                    step=0.05,
                    value=0.4,
                )

            seed = gr.Number(
                label="Seed",
                value=0,
                precision=0,
            )
            submit_btn = gr.Button("Generate Video", variant="primary")

        with gr.Column():
            output_video = gr.Video(label="Generated Video")
    submit_btn.click(
        fn=generate_video,
        inputs=[
            prompt,
            negative_prompt,
            expected_height,
            expected_width,
            downscale_factor,
            num_frames,
            num_inference_steps,
            denoise_strength,
            seed,
        ],
        outputs=output_video,
    )
if __name__ == "__main__":
    demo.queue()  # queue long-running GPU jobs so concurrent requests don't overlap
    demo.launch()