This is an experimental checkpoint; its poor generalization is a known limitation.
Inference code:
import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video
# Load the CogVideoX 1.5 base pipeline in bfloat16 and move it to the GPU.
pipeline = DiffusionPipeline.from_pretrained(
    "THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16
).to("cuda")

# Load the LoRA weights and activate the adapter at a strength of 0.9.
pipeline.load_lora_weights("finetrainers/CogVideoX-1.5-crush-smol-v0", adapter_name="cogvideox-lora")
pipeline.set_adapters("cogvideox-lora", 0.9)
prompt = """PIKA_CRUSH A red toy car is being crushed by a large hydraulic press, which is flattening objects as if they were under a hydraulic press."""
negative_prompt = "inconsistent motion, blurry motion, worse quality, degenerate outputs, deformed outputs"
# Generate 81 frames at 480x768 (height x width) over 50 denoising steps.
video = pipeline(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_frames=81,
    height=480,
    width=768,
    num_inference_steps=50,
).frames[0]

# Write the generated frames to an MP4 at 25 fps.
export_to_video(video, "output.mp4", fps=25)
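
If the whole pipeline does not fit in GPU memory, diffusers' standard model CPU offloading and tiled VAE decoding can be used instead of moving everything to the GPU at once. The sketch below is a minimal variant of the code above under that assumption: it drops the .to("cuda") call in favor of enable_model_cpu_offload() and enables VAE tiling; all other settings are unchanged.

import torch
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

# Load the base pipeline on CPU; offloading moves submodules to the GPU on demand.
pipeline = DiffusionPipeline.from_pretrained(
    "THUDM/CogVideoX1.5-5b", torch_dtype=torch.bfloat16
)
pipeline.load_lora_weights("finetrainers/CogVideoX-1.5-crush-smol-v0", adapter_name="cogvideox-lora")
pipeline.set_adapters("cogvideox-lora", 0.9)

# Keep only the active submodule on the GPU between forward passes and
# decode the latents in tiles to reduce peak memory during VAE decoding.
pipeline.enable_model_cpu_offload()
pipeline.vae.enable_tiling()

video = pipeline(
    prompt="PIKA_CRUSH A red toy car is being crushed by a large hydraulic press, which is flattening objects as if they were under a hydraulic press.",
    negative_prompt="inconsistent motion, blurry motion, worse quality, degenerate outputs, deformed outputs",
    num_frames=81,
    height=480,
    width=768,
    num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=25)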