aidealab
/

AIdeaLab-VideoJP

Model card Files Files and versions

alfredplpl committed on Jan 8

Commit

a1d0d24

·

verified ·

1 Parent(s): 047ef40

Update README.md

Files changed (1) hide show

README.md +20 -15

README.md CHANGED Viewed

@@ -84,20 +84,6 @@ text_encoder = AutoModelForCausalLM.from_pretrained(
 )
 text_encoder=text_encoder.to(device)
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "aidealab/commonvideo",
-    torch_dtype=torch_dtype
-)
-transformer=transformer.to(device)
-vae = AutoencoderKLCogVideoX.from_pretrained(
-    "THUDM/CogVideoX-2b",
-    subfolder="vae"
-)
-vae=vae.to(dtype=torch_dtype, device=device)
-vae.enable_slicing()
-vae.enable_tiling()
 text_inputs = tokenizer(
     prompt,
     padding="max_length",
@@ -122,6 +108,23 @@ null_text_input_ids = null_text_inputs.input_ids
 null_prompt_embeds = text_encoder(null_text_input_ids.to(device), output_hidden_states=True, attention_mask=null_text_inputs.attention_mask.to(device)).hidden_states[-1]
 null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)
 # Euler discrete sampler with classifier-free guidance (cfg)
 z0 = torch.randn(shape, device=device)
 latents = z0.detach().clone().to(torch_dtype)
@@ -137,7 +140,9 @@ with torch.no_grad():
         pred = null_conditional.sample+cfg*(positive_conditional.sample-null_conditional.sample)
         latents = latents.detach().clone() + dt * pred.detach().clone()
-    # Free vram
     latents = latents / vae.config.scaling_factor
     latents = latents.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
     x=vae.decode(latents).sample

 )
 text_encoder=text_encoder.to(device)
 text_inputs = tokenizer(
     prompt,
     padding="max_length",
 null_prompt_embeds = text_encoder(null_text_input_ids.to(device), output_hidden_states=True, attention_mask=null_text_inputs.attention_mask.to(device)).hidden_states[-1]
 null_prompt_embeds = null_prompt_embeds.to(dtype=torch_dtype, device=device)
+# Free VRAM
+del text_encoder
+transformer = CogVideoXTransformer3DModel.from_pretrained(
+    "aidealab/commonvideo",
+    torch_dtype=torch_dtype
+)
+transformer=transformer.to(device)
+vae = AutoencoderKLCogVideoX.from_pretrained(
+    "THUDM/CogVideoX-2b",
+    subfolder="vae"
+)
+vae=vae.to(dtype=torch_dtype, device=device)
+vae.enable_slicing()
+vae.enable_tiling()
 # Euler discrete sampler with classifier-free guidance (cfg)
 z0 = torch.randn(shape, device=device)
 latents = z0.detach().clone().to(torch_dtype)
         pred = null_conditional.sample+cfg*(positive_conditional.sample-null_conditional.sample)
         latents = latents.detach().clone() + dt * pred.detach().clone()
+    # Free VRAM
+    del transformer
     latents = latents / vae.config.scaling_factor
     latents = latents.permute(0, 2, 1, 3, 4) # [B, F, C, H, W]
     x=vae.decode(latents).sample