README.md: fix demo code.

#3
by bweisslt - opened
Files changed (1): README.md (+21 -11)
README.md CHANGED
````diff
@@ -124,7 +124,6 @@ pip install -U git+https://github.com/huggingface/diffusers
 Now, you can run the examples below (note that the upsampling stage is optional but recommended):
 
 ### text-to-video:
-```
 ```py
 import torch
 from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
@@ -137,6 +136,11 @@ pipe.to("cuda")
 pipe_upsample.to("cuda")
 pipe.vae.enable_tiling()
 
+def round_to_nearest_resolution_acceptable_by_vae(height, width):
+    height = height - (height % pipe.vae_spatial_compression_ratio)
+    width = width - (width % pipe.vae_spatial_compression_ratio)
+    return height, width
+
 prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
 negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
 expected_height, expected_width = 704, 512
@@ -145,6 +149,7 @@ num_frames = 121
 
 # Part 1. Generate video at smaller resolution
 downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
 latents = pipe(
     conditions=None,
     prompt=prompt,
@@ -154,7 +159,7 @@ latents = pipe(
     num_frames=num_frames,
     num_inference_steps=7,
     decode_timestep = 0.05,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     decode_noise_scale = 0.025,
     generator=torch.Generator().manual_seed(0),
     output_type="latent",
@@ -178,7 +183,7 @@ video = pipe(
     num_inference_steps=10,
     latents=upscaled_latents,
     decode_timestep = 0.05,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     decode_noise_scale = 0.025,
     image_cond_noise_scale=0.025,
     generator=torch.Generator().manual_seed(0),
@@ -205,6 +210,11 @@ pipe.to("cuda")
 pipe_upsample.to("cuda")
 pipe.vae.enable_tiling()
 
+def round_to_nearest_resolution_acceptable_by_vae(height, width):
+    height = height - (height % pipe.vae_spatial_compression_ratio)
+    width = width - (width % pipe.vae_spatial_compression_ratio)
+    return height, width
+
 image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png")
 video = [image]
 condition1 = LTXVideoCondition(video=video, frame_index=0)
@@ -226,7 +236,7 @@ latents = pipe(
     height=downscaled_height,
     num_frames=num_frames,
     num_inference_steps=7,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     decode_timestep = 0.05,
     decode_noise_scale = 0.025,
     generator=torch.Generator().manual_seed(0),
@@ -250,7 +260,7 @@ video = pipe(
     num_frames=num_frames,
     denoise_strength=0.3, # Effectively, 4 inference steps out of 10
     num_inference_steps=10,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     latents=upscaled_latents,
     decode_timestep = 0.05,
     decode_noise_scale = 0.025,
@@ -263,7 +273,6 @@ video = pipe(
 video = [frame.resize((expected_width, expected_height)) for frame in video]
 
 export_to_video(video, "output.mp4", fps=24)
-
 ```
 
 ### For video-to-video:
@@ -281,9 +290,10 @@ pipe_upsample.to("cuda")
 pipe.vae.enable_tiling()
 
 def round_to_nearest_resolution_acceptable_by_vae(height, width):
-    height = height - (height % pipe.vae_temporal_compression_ratio)
-    width = width - (width % pipe.vae_temporal_compression_ratio)
+    height = height - (height % pipe.vae_spatial_compression_ratio)
+    width = width - (width % pipe.vae_spatial_compression_ratio)
     return height, width
+
 video = load_video(
     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
 )[:21] # Use only the first 21 frames as conditioning
@@ -306,7 +316,7 @@ latents = pipe(
     height=downscaled_height,
     num_frames=num_frames,
     num_inference_steps=7,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     decode_timestep = 0.05,
     decode_noise_scale = 0.025,
     generator=torch.Generator().manual_seed(0),
@@ -331,7 +341,7 @@ video = pipe(
     num_frames=num_frames,
     denoise_strength=0.3, # Effectively, 4 inference steps out of 10
     num_inference_steps=10,
-    guidnace_scale=1.0,
+    guidance_scale=1.0,
     latents=upscaled_latents,
     decode_timestep = 0.05,
     decode_noise_scale = 0.025,
@@ -344,8 +354,8 @@ video = pipe(
 video = [frame.resize((expected_width, expected_height)) for frame in video]
 
 export_to_video(video, "output.mp4", fps=24)
-
 ```
+
 To learn more, check out the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video).
 
 Diffusers also supports directly loading from the original LTX checkpoints using the `from_single_file()` method. Check out [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/ltx_video#loading-single-files) to learn more.
````
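For context on the new `round_to_nearest_resolution_acceptable_by_vae` helper: the LTX VAE compresses height and width by a fixed spatial factor, so both dimensions must be multiples of that factor before they reach the pipeline. Below is a minimal standalone sketch of the arithmetic, assuming a spatial compression ratio of 32 (normally read from `pipe.vae_spatial_compression_ratio`) and an illustrative `downscale_factor` of 2/3; the actual value is defined elsewhere in the README, not in this diff.

```py
def round_to_nearest_resolution_acceptable_by_vae(height, width, ratio=32):
    # Floor each dimension to the nearest multiple of the VAE's spatial
    # compression ratio so the latent grid has an integer size.
    height = height - (height % ratio)
    width = width - (width % ratio)
    return height, width

expected_height, expected_width = 704, 512
downscale_factor = 2 / 3  # illustrative value, not taken from this diff
downscaled = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
print(downscaled)                                                  # (469, 341)
print(round_to_nearest_resolution_acceptable_by_vae(*downscaled))  # (448, 320)
```

Without the rounding call this PR inserts after the downscale computation, values like 469x341, which are not divisible by the compression ratio, would be passed straight to the pipeline.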
 
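On the repeated `guidnace_scale` to `guidance_scale` fix: the value matters because diffusers pipelines generally enable classifier-free guidance only when `guidance_scale > 1.0`, so `guidance_scale=1.0` is how these distilled-model examples keep guidance off, and a misspelled keyword never reaches that logic. A small hypothetical guard (the `check_pipe_kwargs` helper below is ours, not part of diffusers) can surface such typos before a long generation run:

```py
import inspect

def check_pipe_kwargs(pipe, **kwargs):
    # Compare the kwargs we intend to pass against the pipeline's
    # __call__ signature and flag anything it does not accept.
    accepted = set(inspect.signature(pipe.__call__).parameters)
    unknown = set(kwargs) - accepted
    if unknown:
        raise TypeError(f"unexpected pipeline kwargs: {sorted(unknown)}")

# check_pipe_kwargs(pipe, guidnace_scale=1.0)  # would raise and expose the typo
```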
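The video-to-video hunk also swaps `vae_temporal_compression_ratio` for `vae_spatial_compression_ratio` inside the helper. Height and width are spatial dimensions, so they must align to the spatial factor; the temporal factor constrains the frame count instead. A sketch of the distinction, assuming LTX's usual ratios of 32 (spatial) and 8 (temporal) rather than reading them off a live pipeline:

```py
spatial_ratio = 32   # stand-in for pipe.vae_spatial_compression_ratio
temporal_ratio = 8   # stand-in for pipe.vae_temporal_compression_ratio

height, width, num_frames = 448, 320, 121
# Spatial dims must be multiples of the spatial ratio...
assert height % spatial_ratio == 0 and width % spatial_ratio == 0
# ...while frame counts follow a (k * temporal_ratio + 1) pattern: 121 = 15 * 8 + 1.
assert (num_frames - 1) % temporal_ratio == 0
```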