Upload folder using huggingface_hub

- demo.py +25 -33
- requirements.txt +2 -1
- sample_videos/t2v-temp.mp4 +0 -0
demo.py CHANGED

@@ -12,7 +12,7 @@ from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
 from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
 from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from omegaconf import OmegaConf
-from transformers import T5EncoderModel, T5Tokenizer
+from transformers import T5EncoderModel, T5Tokenizer, BitsAndBytesConfig
 
 import os, sys
 sys.path.append(os.path.split(sys.path[0])[0])

@@ -38,7 +38,11 @@ if args.enable_vae_temporal_decoder:
 else:
     vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae", torch_dtype=torch.float16).to(device)
 tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
-text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path,
+                                              subfolder="text_encoder",
+                                              quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
+                                              device_map="auto",
+                                              )
 
 # set eval mode
 transformer_model.eval()

@@ -120,16 +124,28 @@ def gen_video(text_input, sample_method, scfg_scale, seed, height, width, video_
                                    beta_end=args.beta_end,
                                    beta_schedule=args.beta_schedule,
                                    variance_type=args.variance_type)
+
+    pipe_tmp = LattePipeline.from_pretrained(
+        args.pretrained_model_path,
+        transformer=None,
+        text_encoder=text_encoder,
+        device_map="balanced",)
+    prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(text_input, negative_prompt="")
 
 
     videogen_pipeline = LattePipeline(vae=vae,
-                                      text_encoder=text_encoder,
+                                      # text_encoder=text_encoder,
+                                      text_encoder=None,
                                       tokenizer=tokenizer,
                                       scheduler=scheduler,
                                       transformer=transformer_model).to(device)
     # videogen_pipeline.enable_xformers_memory_efficient_attention()
 
-    videos = videogen_pipeline(
+    videos = videogen_pipeline(
+                               # text_input,
+                               prompt_embeds=prompt_embeds,
+                               negative_prompt=None,
+                               negative_prompt_embeds=negative_prompt_embeds,
                                video_length=video_length,
                                height=height,
                                width=width,

@@ -185,26 +201,12 @@ with gr.Blocks() as demo:
     with gr.Column(visible=True) as input_raws:
         with gr.Row():
             with gr.Column(scale=1.0):
-                # text_input = gr.Textbox(show_label=True, interactive=True, label="Text prompt").style(container=False)
                 text_input = gr.Textbox(show_label=True, interactive=True, label="Prompt")
-
-            # with gr.Column(scale=0.5):
-            #     image_input = gr.Image(show_label=True, interactive=True, label="Reference image").style(container=False)
-            # with gr.Column(scale=0.5):
-            #     preframe_input = gr.Image(show_label=True, interactive=True, label="First frame").style(container=False)
+
         with gr.Row():
             with gr.Column(scale=0.5):
                 sample_method = gr.Dropdown(choices=["DDIM", "EulerDiscrete", "PNDM"], label="Sample Method", value="DDIM")
-
-            # with gr.Column(scale=1.0):
-            #     video_length = gr.Slider(
-            #         minimum=1,
-            #         maximum=24,
-            #         value=1,
-            #         step=1,
-            #         interactive=True,
-            #         label="Video Length (1 for T2I and 16 for T2V)",
-            #     )
+
             with gr.Column(scale=0.5):
                 video_length = gr.Dropdown(choices=[1, 16], label="Video Length (1 for T2I and 16 for T2V)", value=16)
         with gr.Row():

@@ -260,21 +262,11 @@ with gr.Blocks() as demo:
 
 
     with gr.Column(scale=0.6, visible=True) as video_upload:
-        # with gr.Column(visible=True) as video_upload:
         output = gr.Video(interactive=False, include_audio=True, elem_id="输出的视频") #.style(height=360)
-        # with gr.Column(elem_id="image", scale=0.5) as img_part:
-        #     with gr.Tab("Video", elem_id='video_tab'):
-
-        #     with gr.Tab("Image", elem_id='image_tab'):
-        #         up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload").style(height=360)
-        # upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
-        # clear = gr.Button("Restart")
 
     with gr.Row():
         with gr.Column(scale=1.0, min_width=0):
-            run = gr.Button("
-        # with gr.Column(scale=0.5, min_width=0):
-        #     clear = gr.Button("🔄Clear️")
+            run = gr.Button(value="Generate", variant='primary')
 
     EXAMPLES = [
         ["3D animation of a small, round, fluffy creature with big, expressive eyes explores a vibrant, enchanted forest. The creature, a whimsical blend of a rabbit and a squirrel, has soft blue fur and a bushy, striped tail. It hops along a sparkling stream, its eyes wide with wonder. The forest is alive with magical elements: flowers that glow and change colors, trees with leaves in shades of purple and silver, and small floating lights that resemble fireflies. The creature stops to interact playfully with a group of tiny, fairy-like beings dancing around a mushroom ring. The creature looks up in awe at a large, glowing tree that seems to be the heart of the forest.", "DDIM", 7.5, 100, 512, 512, 16, 50],

@@ -291,8 +283,8 @@ with gr.Blocks() as demo:
         fn = gen_video,
         inputs=[text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step],
         outputs=[output],
-
-        cache_examples="lazy",
+        cache_examples=True,
+        # cache_examples="lazy",
     )
 
     run.click(gen_video, [text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step], [output])
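For context, the demo.py changes above apply a memory-saving pattern: the T5 text encoder is loaded in 4-bit via bitsandbytes, prompt embeddings are computed once with a temporary transformer-free pipeline, and the generation pipeline then runs with text_encoder=None, consuming only the precomputed embeddings. The sketch below restates that flow in one place; it is not part of this commit, and the checkpoint id and prompt are placeholder assumptions (the demo itself uses args.pretrained_model_path and the user's text input).

# Minimal sketch of the pattern introduced in this commit (assumptions noted above).
import torch
from transformers import T5EncoderModel, BitsAndBytesConfig
from diffusers import LattePipeline

ckpt = "maxin-cn/Latte-1"  # assumed example id; the demo uses args.pretrained_model_path

# 1) Load only the T5 text encoder, quantized to 4 bits via bitsandbytes.
text_encoder = T5EncoderModel.from_pretrained(
    ckpt,
    subfolder="text_encoder",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True,
                                           bnb_4bit_compute_dtype=torch.float16),
    device_map="auto",
)

# 2) Encode the prompt once with a pipeline that carries no transformer.
pipe_tmp = LattePipeline.from_pretrained(
    ckpt, transformer=None, text_encoder=text_encoder, device_map="balanced")
prompt_embeds, negative_prompt_embeds = pipe_tmp.encode_prompt(
    "a small fluffy creature in an enchanted forest",  # placeholder prompt
    negative_prompt="")

# 3) The generation pipeline is then built with text_encoder=None and receives
#    the precomputed prompt_embeds / negative_prompt_embeds, as in the diff above.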
requirements.txt CHANGED

@@ -17,4 +17,5 @@ beautifulsoup4
 ftfy
 omegaconf
 spaces
-imageio-ffmpeg
+imageio-ffmpeg
+bitsandbytes
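The new bitsandbytes entry backs BitsAndBytesConfig(load_in_4bit=True) in demo.py; transformers treats it as an optional dependency, so without it the 4-bit text-encoder load fails only at runtime. An optional early guard (a sketch, not part of this commit) could make that failure explicit:

# Optional sketch: fail early with a clear message if bitsandbytes is missing,
# since the 4-bit T5EncoderModel load in demo.py depends on it.
import importlib.util

if importlib.util.find_spec("bitsandbytes") is None:
    raise RuntimeError("bitsandbytes is required for BitsAndBytesConfig(load_in_4bit=True)")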
sample_videos/t2v-temp.mp4 CHANGED

Binary files a/sample_videos/t2v-temp.mp4 and b/sample_videos/t2v-temp.mp4 differ