Upload folder using huggingface_hub

- demo.py +25 -33
- requirements.txt +2 -1
- sample_videos/t2v-temp.mp4 +0 -0
demo.py CHANGED

@@ -12,7 +12,7 @@ from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
 from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
 from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from omegaconf import OmegaConf
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import T5EncoderModel, T5Tokenizer, BitsAndBytesConfig
 
 import os, sys
 sys.path.append(os.path.split(sys.path[0])[0])

@@ -38,7 +38,11 @@ if args.enable_vae_temporal_decoder:
 else:
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae", torch_dtype=torch.float16).to(device)
 tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
-text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+                                              subfolder="text_encoder",
+                                              quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
+                                              device_map="auto",
+                                              )
 
 # set eval mode
 transformer_model.eval()

@@ -120,16 +124,28 @@ def gen_video(text_input, sample_method, scfg_scale, seed, height, width, video_
                                    beta_end=args.beta_end,
                                    beta_schedule=args.beta_schedule,
                                    variance_type=args.variance_type)
+
+    pipe_tmp = LattePipeline.from_pretrained(
+        args.pretrained_model_path,
+        transformer=None,
+        text_encoder=text_encoder,
+        device_map="balanced",)
+    prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(text_input, negative_prompt="")
 
 
     videogen_pipeline = LattePipeline(vae=vae,
-                                      text_encoder=text_encoder,
+                                      # text_encoder=text_encoder,
+                                      text_encoder=None,
                                       tokenizer=tokenizer,
                                       scheduler=scheduler,
                                       transformer=transformer_model).to(device)
     # videogen_pipeline.enable_xformers_memory_efficient_attention()
 
-    videos = videogen_pipeline(
+    videos = videogen_pipeline(
+                               # text_input,
+                               prompt_embeds=prompt_embeds,
+                               negative_prompt=None,
+                               negative_prompt_embeds=negative_prompt_embeds,
                                video_length=video_length,
                                height=height,
                                width=width,

@@ -185,26 +201,12 @@ with gr.Blocks() as demo:
     with gr.Column(visible=True) as input_raws:
         with gr.Row():
             with gr.Column(scale=1.0):
-                # text_input = gr.Textbox(show_label=True, interactive=True, label="Text prompt").style(container=False)
                 text_input = gr.Textbox(show_label=True, interactive=True, label="Prompt")
-
-            # with gr.Column(scale=0.5):
-            #     image_input = gr.Image(show_label=True, interactive=True, label="Reference image").style(container=False)
-            # with gr.Column(scale=0.5):
-            #     preframe_input = gr.Image(show_label=True, interactive=True, label="First frame").style(container=False)
+
         with gr.Row():
             with gr.Column(scale=0.5):
                 sample_method = gr.Dropdown(choices=["DDIM", "EulerDiscrete", "PNDM"], label="Sample Method", value="DDIM")
-
-            # with gr.Column(scale=1.0):
-            #     video_length = gr.Slider(
-            #         minimum=1,
-            #         maximum=24,
-            #         value=1,
-            #         step=1,
-            #         interactive=True,
-            #         label="Video Length (1 for T2I and 16 for T2V)",
-            #     )
+
             with gr.Column(scale=0.5):
                 video_length = gr.Dropdown(choices=[1, 16], label="Video Length (1 for T2I and 16 for T2V)", value=16)
         with gr.Row():

@@ -260,21 +262,11 @@ with gr.Blocks() as demo:
 
 
     with gr.Column(scale=0.6, visible=True) as video_upload:
-        # with gr.Column(visible=True) as video_upload:
         output = gr.Video(interactive=False, include_audio=True, elem_id="输出的视频") #.style(height=360)
-        # with gr.Column(elem_id="image", scale=0.5) as img_part:
-        #     with gr.Tab("Video", elem_id='video_tab'):
-
-        #     with gr.Tab("Image", elem_id='image_tab'):
-        #         up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload").style(height=360)
-        # upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
-        # clear = gr.Button("Restart")
 
     with gr.Row():
         with gr.Column(scale=1.0, min_width=0):
-            run = gr.Button("
-        # with gr.Column(scale=0.5, min_width=0):
-        #     clear = gr.Button("🔄Clear️")
+            run = gr.Button(value="Generate", variant='primary')
 
     EXAMPLES = [
         ["3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.", "DDIM", 7.5, 100, 512, 512, 16, 50],

@@ -291,8 +283,8 @@ with gr.Blocks() as demo:
         fn = gen_video,
         inputs=[text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step],
         outputs=[output],
-
-        cache_examples="lazy",
+        cache_examples=True,
+        # cache_examples="lazy",
     )
 
     run.click(gen_video, [text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step], [output])
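For context, the demo.py changes above apply a memory-saving pattern: the T5 text encoder is loaded in 4-bit via bitsandbytes, prompt embeddings are computed once with a temporary transformer-free pipeline, and the generation pipeline then runs with text_encoder=None, consuming only the precomputed embeddings. The sketch below restates that flow in one place; it is not part of this commit, and the checkpoint id and prompt are placeholder assumptions (the demo itself uses args.pretrained_model_path and the user's text input).

# Minimal sketch of the pattern introduced in this commit (assumptions noted above).
import torch
from transformers import T5EncoderModel, BitsAndBytesConfig
from diffusers import LattePipeline

ckpt = "maxin-cn/Latte-1"  # assumed example id; the demo uses args.pretrained_model_path

# 1) Load only the T5 text encoder, quantized to 4 bits via bitsandbytes.
text_encoder = T5EncoderModel.from_pretrained(
    ckpt,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.float16),
    device_map="auto",
)

# 2) Encode the prompt once with a pipeline that carries no transformer.
pipe_tmp = LattePipeline.from_pretrained(
    ckpt, transformer=None, text_encoder=text_encoder, device_map="balanced")
prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(
    "a small fluffy creature in an enchanted forest",  # placeholder prompt
    negative_prompt="")

# 3) The generation pipeline is then built with text_encoder=None and receives
#    the precomputed prompt_embeds / negative_prompt_embeds, as in the diff above.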
requirements.txt CHANGED

@@ -17,4 +17,5 @@ beautifulsoup4
 ftfy
 omegaconf
 spaces
-imageio-ffmpeg
+imageio-ffmpeg
+bitsandbytes
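The new bitsandbytes entry backs BitsAndBytesConfig(load_in_4bit=True) in demo.py; transformers treats it as an optional dependency, so without it the 4-bit text-encoder load fails only at runtime. An optional early guard (a sketch, not part of this commit) could make that failure explicit:

# Optional sketch: fail early with a clear message if bitsandbytes is missing,
# since the 4-bit T5EncoderModel load in demo.py depends on it.
import importlib.util

if importlib.util.find_spec("bitsandbytes") is None:
    raise RuntimeError("bitsandbytes is required for BitsAndBytesConfig(load_in_4bit=True)")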
sample_videos/t2v-temp.mp4 CHANGED

Binary files a/sample_videos/t2v-temp.mp4 and b/sample_videos/t2v-temp.mp4 differ