Spaces:

kiwhansong
/

diffusion-forcing-transformer

Running on Zero

App Files Files Community

kiwhansong committed 9 days ago

Commit

11554c5

1 Parent(s): 9e24bfb

complete task 1

Browse files

Files changed (5) hide show

.gitignore +4 -0
app.py +203 -4
config.yaml +138 -0
export.py +9 -0
requirements.txt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+huggingface
+.DS_Store
+data/
+__pycache__/

app.py CHANGED Viewed

@@ -1,7 +1,206 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

+from pathlib import Path
+import spaces
 import gradio as gr
+import imageio
+import torch
+from PIL import Image
+from omegaconf import OmegaConf
+from algorithms.dfot import DFoTVideoPose
+from utils.ckpt_utils import download_pretrained
+from datasets.video.utils.io import read_video
+from datasets.video import RealEstate10KAdvancedVideoDataset
+from export import export_to_video
+# Root directory of the sample dataset; the metadata file and the per-video
+# pose files below are read relative to it.
+DATASET_DIR = Path("data/real-estate-10k-tiny")
+LONG_LENGTH = 20 # seconds
+# weights_only=False makes torch.load unpickle arbitrary Python objects, so
+# this file must come from a trusted source.
+metadata = torch.load(DATASET_DIR / "metadata" / "test.pt", weights_only=False)
+# Every video listed in the metadata is loaded eagerly at import time, with
+# its axes permuted by (0, 3, 1, 2) and its values divided by 255.
+# presumably read_video returns (T, H, W, C) frames in 0-255, giving
+# (T, C, H, W) tensors in [0, 1] here — TODO confirm
+video_list = [
+    read_video(path).permute(0, 3, 1, 2) / 255.0 for path in metadata["video_paths"]
+]
+# First frame of each video, rescaled by 255, permuted by (1, 2, 0) and
+# converted to a uint8 NumPy array.
+first_frame_list = [
+    (video[0] * 255).permute(1, 2, 0).numpy().clip(0, 255).astype("uint8")
+    for video in video_list
+]
+# One float32 pose tensor per video, loaded from test_poses/<video stem>.pt.
+# Columns 0-3 and 6 onward of dimension 1 are kept; columns 4 and 5 are dropped.
+# NOTE(review): the meaning of the dropped columns is not visible here —
+# confirm against the dataset's pose format.
+poses_list = [
+    torch.cat(
+        [
+            poses[:, :4],
+            poses[:, 6:],
+        ],
+        dim=-1,
+    ).to(torch.float32)
+    for poses in (
+        torch.load(DATASET_DIR / "test_poses" / f"{path.stem}.pt")
+        for path in metadata["video_paths"]
+    )
+]
+# Load the pretrained checkpoint with the settings from config.yaml, put the
+# model in eval mode, and move it to CUDA. This also runs at import time.
+# pylint: disable-next=no-value-for-parameter
+dfot = DFoTVideoPose.load_from_checkpoint(
+    checkpoint_path=download_pretrained("pretrained:DFoT_RE10K.ckpt"),
+    cfg=OmegaConf.load("config.yaml"),
+).eval()
+dfot.to("cuda")
+def prepare_long_gt_video(idx: int):
+    """Export an evenly subsampled ground-truth clip for sample ``idx``.
+
+    ``LONG_LENGTH * 10`` frame indices are spread linearly from the first to
+    the last frame of ``video_list[idx]`` and the selected frames are written
+    out at 10 fps.
+
+    Args:
+        idx: Index into ``video_list``.
+
+    Returns:
+        Whatever ``export_to_video`` returns for the subsampled clip.
+    """
+    gt_fps = 10
+    clip = video_list[idx]
+    last_frame = clip.size(0) - 1
+    frame_ids = torch.linspace(
+        0, last_frame, steps=LONG_LENGTH * gt_fps, dtype=torch.long
+    )
+    subsampled = clip[frame_ids]
+    return export_to_video(subsampled, fps=gt_fps)
+@spaces.GPU(duration=120)
+@torch.no_grad()
+def single_image_to_long_video(idx: int, guidance_scale: float, fps: int, progress=gr.Progress(track_tqdm=True)):
+    """Generate a ``LONG_LENGTH``-second video for dataset sample ``idx``.
+
+    Args:
+        idx: Index into ``video_list`` and ``poses_list``.
+        guidance_scale: Value written to the prediction task's history
+            guidance scale before sampling.
+        fps: Frames per second; ``LONG_LENGTH * fps`` evenly spaced frames
+            (and their poses) are taken from the source video.
+        progress: Gradio progress tracker created with ``track_tqdm=True``;
+            it is not referenced in the body.
+
+    Returns:
+        Whatever ``export_to_video`` returns for the generated clip, encoded
+        at ``fps``.
+    """
+    # The FPS slider is not guaranteed to deliver an int, and torch.linspace
+    # rejects a non-integer number of steps, so coerce once up front.
+    fps = int(fps)
+    video = video_list[idx]
+    poses = poses_list[idx]
+    indices = torch.linspace(0, video.size(0) - 1, LONG_LENGTH * fps, dtype=torch.long)
+    # Add a leading batch dimension of 1 and move both inputs to the GPU.
+    xs = video[indices].unsqueeze(0).to("cuda")
+    conditions = poses[indices].unsqueeze(0).to("cuda")
+    # NOTE: these assignments mutate the module-level model's config, so the
+    # values persist after this call returns.
+    dfot.cfg.tasks.prediction.history_guidance.guidance_scale = guidance_scale
+    dfot.cfg.tasks.prediction.keyframe_density = 0.6 / fps
+    # dfot.cfg.tasks.interpolation.history_guidance.guidance_scale = guidance_scale
+    gen_video = dfot._unnormalize_x(
+        dfot._predict_videos(
+            dfot._normalize_x(xs),
+            conditions,
+        )
+    )
+    # Drop the batch dimension and bring the result back to the CPU for encoding.
+    return export_to_video(gen_video[0].detach().cpu(), fps=fps)
+# Create the Gradio Blocks
+with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
+    # Inline CSS targeting elements whose data-tab-id is task-1, task-2 or
+    # task-3 (the ids given to the three tabs below).
+    gr.HTML(
+        """
+    <style>
+    [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
+        font-size: 16px !important;
+        font-weight: bold;
+    }
+    </style>
+    """
+    )
+    gr.Markdown("# Diffusion Forcing Transformer and History Guidance")
+    # NOTE(review): the paper link in this heading is still the placeholder "todo".
+    gr.Markdown(
+        "### Official Interactive Demo for [_History-guided Video Diffusion_](todo)"
+    )
+    # Row of external link buttons.
+    # NOTE(review): the Website link is still the placeholder "todo", while the
+    # Paper button points at a boyuan.space page — confirm both targets.
+    with gr.Row():
+        gr.Button(value="🌐 Website", link="todo")
+        gr.Button(value="📄 Paper", link="https://boyuan.space/history-guidance")
+        gr.Button(
+            value="💻 Code",
+            link="https://github.com/kwsong0113/diffusion-forcing-transformer",
+        )
+        gr.Button(
+            value="🤗 Pretrained Models", link="https://huggingface.co/kiwhansong/DFoT"
+        )
+    # NOTE(review): this tab has id "task-1" but its heading reads "Demo 2" —
+    # confirm the intended numbering.
+    with gr.Tab("Single Image → Long Video", id="task-1"):
+        gr.Markdown(
+            """
+            ## Demo 2: Single Image → Long Video
+            > #### **TL;DR:** _Diffusion Forcing Transformer, with History Guidance, can stably generate long videos, via sliding window rollouts and interpolation._
+        """
+        )
+        # Two-stage UI state: "Selection" renders the image gallery and
+        # "Generation" renders the generation controls. selected_index holds
+        # the chosen gallery index and stays None until an image is picked.
+        stage = gr.State(value="Selection")
+        selected_index = gr.State(value=None)
+        @gr.render(inputs=[stage, selected_index])
+        def render_stage(s, idx):
+            """Build the components for stage ``s`` using the selected index ``idx``."""
+            match s:
+                case "Selection":
+                    image_gallery = gr.Gallery(
+                        value=first_frame_list,
+                        label="Select an image to animate",
+                        columns=[8],
+                        selected_index=idx,
+                    )
+                    @image_gallery.select(inputs=None, outputs=selected_index)
+                    def update_selection(selection: gr.SelectData):
+                        """Store the index of the clicked gallery item in ``selected_index``."""
+                        return selection.index
+                    select_button = gr.Button("Select")
+                    @select_button.click(inputs=selected_index, outputs=stage)
+                    def move_to_generation(idx: int):
+                        """Return the next stage name; warn and stay on "Selection" if nothing is selected."""
+                        if idx is None:
+                            gr.Warning("Image not selected!")
+                            return "Selection"
+                        else:
+                            return "Generation"
+                case "Generation":
+                    with gr.Row():
+                        gr.Image(value=first_frame_list[idx], label="Input Image")
+                        # gr.Video(value=metadata["video_paths"][idx], label="Ground Truth Video")
+                        # The ground-truth clip is exported each time this stage is rendered.
+                        gr.Video(value=prepare_long_gt_video(idx), label="Ground Truth Video")
+                        video = gr.Video(label="Generated Video")
+                        with gr.Column():
+                            guidance_scale = gr.Slider(
+                                minimum=1,
+                                maximum=6,
+                                value=4,
+                                step=0.5,
+                                label="History Guidance Scale",
+                                info="Without history guidance: 1.0; Recommended: 4.0",
+                                interactive=True,
+                            )
+                            fps = gr.Slider(
+                                minimum=1,
+                                maximum=10,
+                                value=4,
+                                step=1,
+                                label="FPS",
+                                info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
+                                interactive=True,
+                            )
+                            # NOTE: generate_button is bound to the value returned by
+                            # .click(...), not to the Button component itself.
+                            generate_button = gr.Button("Generate Video").click(
+                                fn=single_image_to_long_video,
+                                inputs=[selected_index, guidance_scale, fps],
+                                outputs=video,
+                            )
+                    # def generate_video(idx: int):
+                    #     gr.Video(value=single_image_to_long_video(idx))
+        # Function to update the state with the selected index
+        # def show_warning(selection: gr.SelectData):
+        #     gr.Warning(f"Your choice is #{selection.index}, with image: {selection.value['image']['path']}!")
+        # # image_gallery.select(fn=show_warning, inputs=None)
+        # # Show the generate button only if an image is selected
+        # selected_index.change(fn=lambda idx: idx is not None, inputs=selected_index, outputs=generate_button)
+    # Placeholder tab: the components are created but no event handler is attached.
+    # NOTE(review): this tab has id "task-2" but its heading reads "Demo 1" —
+    # confirm the intended numbering.
+    with gr.Tab("Any Images → Video", id="task-2"):
+        gr.Markdown(
+            """
+            ## Demo 1: Any Images → Video
+            > #### **TL;DR:** _Diffusion Forcing Transformer is a flexible model that can generate videos given variable number of context frames._
+        """
+        )
+        input_text_1 = gr.Textbox(
+            lines=2, placeholder="Enter text for Video Model 1..."
+        )
+        output_video_1 = gr.Video()
+        generate_button_1 = gr.Button("Generate Video")
+    # Placeholder tab: the components are created but no event handler is attached.
+    # NOTE(review): the TL;DR text repeats the one used in the "task-2" tab —
+    # confirm it is the intended description for this demo.
+    with gr.Tab("Single Image → Extremely Long Video", id="task-3"):
+        gr.Markdown(
+            """
+            ## Demo 3: Single Image → Extremely Long Video
+            > #### **TL;DR:** _Diffusion Forcing Transformer is a flexible model that can generate videos given **variable number of context frames**._
+        """
+        )
+        input_text_2 = gr.Textbox(
+            lines=2, placeholder="Enter text for Video Model 2..."
+        )
+        output_video_2 = gr.Video()
+        generate_button_2 = gr.Button("Generate Video")
+# Start the Gradio server only when the file is run as a script.
+if __name__ == "__main__":
+    demo.launch()

config.yaml ADDED Viewed

	@@ -0,0 +1,138 @@

+debug: False
+lr: 5e-5
+backbone:
+  name: u_vit3d_pose
+  channels:
+  - 128
+  - 256
+  - 576
+  - 1152
+  emb_channels: 1024
+  patch_size: 2
+  block_types:
+  - ResBlock
+  - ResBlock
+  - TransformerBlock
+  - TransformerBlock
+  block_dropouts:
+  - 0.0
+  - 0.0
+  - 0.1
+  - 0.1
+  num_updown_blocks:
+  - 3
+  - 3
+  - 6
+  num_mid_blocks: 20
+  num_heads: 9
+  pos_emb_type: rope
+  use_checkpointing:
+  - false
+  - false
+  - false
+  - true
+  conditioning:
+    dim: null
+  external_cond_dropout: 0.1
+  use_fourier_noise_embedding: true
+x_shape: [3, 256, 256]
+max_frames: 8
+n_frames: 8
+frame_skip: 1
+context_frames: 1
+latent:
+  enable: False
+  type: pre_sample
+  suffix: null
+  downsampling_factor: [1, 8]
+  num_channels: 4
+data_mean: [[[0.577]], [[0.517]], [[0.461]]]
+data_std: [[[0.249]], [[0.249]], [[0.268]]]
+external_cond_dim: 16
+external_cond_stack: False
+external_cond_processing: null
+compile: false
+weight_decay: 0.01
+optimizer_beta:
+- 0.9
+- 0.99
+lr_scheduler:
+  name: constant_with_warmup
+  num_warmup_steps: 10000
+  num_training_steps: 550000
+noise_level: random_independent
+uniform_future:
+  enabled: false
+fixed_context:
+  enabled: false
+  indices: null
+  dropout: 0
+variable_context:
+  enabled: false
+  prob: 0
+  dropout: 0
+chunk_size: -1
+scheduling_matrix: full_sequence
+replacement: noisy_scale
+diffusion:
+  is_continuous: true
+  timesteps: 1000
+  beta_schedule: cosine_simple_diffusion
+  schedule_fn_kwargs:
+    shift: 1.0
+    shifted: 0.125
+    interpolated: false
+  use_causal_mask: false
+  clip_noise: 20.0
+  objective: pred_v
+  loss_weighting:
+    strategy: sigmoid
+    snr_clip: 5.0
+    cum_snr_decay: 0.9
+    sigmoid_bias: -1.0
+  sampling_timesteps: 50
+  ddim_sampling_eta: 0.0
+  reconstruction_guidance: 0.0
+  training_schedule:
+    name: cosine
+    shift: 0.125
+  precond_scale: 0.125
+vae:
+  pretrained_path: null
+  pretrained_kwargs: {}
+  use_fp16: true
+  batch_size: 2
+checkpoint:
+  reset_optimizer: false
+  strict: true
+tasks:
+  prediction:
+    enabled: true
+    history_guidance:
+      name: stabilized_vanilla
+      guidance_scale: 4.0
+      stabilization_level: 0.02
+      visualize: False
+    keyframe_density: null
+    sliding_context_len: null
+  interpolation:
+    enabled: false
+    history_guidance:
+      name: vanilla
+      guidance_scale: 1.5
+      visualize: False
+    max_batch_size: 1
+logging:
+  deterministic: null
+  loss_freq: 100
+  grad_norm_freq: 100
+  max_num_videos: 256
+  n_metrics_frames: null
+  metrics: []
+  metrics_batch_size: 16
+  sanity_generation: false
+  raw_dir: null
+camera_pose_conditioning:
+  normalize_by: first
+  bound: null
+  type: ray_encoding

export.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import tempfile
+import torch
+from torch import Tensor
+from torchvision.io import write_video
+def export_to_video(tensor: Tensor, fps: int = 10) -> str:
+    """Encode a video tensor as an MP4 file and return the file's path.
+
+    Args:
+        tensor: Video frames. The tensor is permuted with ``(0, 2, 3, 1)``,
+            multiplied by 255, clamped to ``[0, 255]`` and cast to uint8
+            before encoding (assumes a ``(T, C, H, W)`` layout with values
+            in ``[0, 1]`` — confirm against callers).
+        fps: Frame rate passed to the encoder.
+
+    Returns:
+        Path of the newly written ``.mp4`` file. The file is not removed
+        automatically; the caller owns it.
+    """
+    # Reserve the path with delete=False. Taking ``.name`` from a discarded
+    # NamedTemporaryFile lets the file be deleted as soon as the object is
+    # collected, so another process could claim the same name before
+    # write_video creates it.
+    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
+        path = tmp.name
+    frames = (tensor.permute(0, 2, 3, 1) * 255).clamp(0, 255).to(torch.uint8)
+    write_video(path, frames, fps=fps)
+    return path

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+# gradio
+# spaces
+git+https://github.com/kwsong0113/dfot-test.git@release#egg=dfot # FIXME: change to the official repo