kiwhansong committed
Commit 5359939 · 1 Parent(s): eb1feee

finish demo

Files changed (3)
  1. app.py +549 -49
  2. camera_pose.py +94 -0
  3. history_guidance.py +24 -0
app.py CHANGED
@@ -6,19 +6,20 @@ import gradio as gr
6
  import numpy as np
7
  import torch
8
  from torchvision.datasets.utils import download_and_extract_archive
9
- from PIL import Image
10
  from omegaconf import OmegaConf
11
  from algorithms.dfot import DFoTVideoPose
12
- from algorithms.dfot.history_guidance import HistoryGuidance
13
  from utils.ckpt_utils import download_pretrained
14
- from utils.huggingface_utils import download_from_hf
15
  from datasets.video.utils.io import read_video
16
- from datasets.video import RealEstate10KAdvancedVideoDataset
17
  from export import export_to_video, export_to_gif, export_images_to_gif
18
 
19
  DATASET_URL = "https://huggingface.co/kiwhansong/DFoT/resolve/main/datasets/RealEstate10K_Tiny.tar.gz"
20
  DATASET_DIR = Path("data/real-estate-10k-tiny")
21
- LONG_LENGTH = 20 # seconds
 
22
 
23
  if not DATASET_DIR.exists():
24
  DATASET_DIR.mkdir(parents=True)
@@ -69,8 +70,8 @@ dfot.to("cuda")
69
 
70
  def prepare_long_gt_video(idx: int):
71
  video = video_list[idx]
72
- indices = torch.linspace(0, video.size(0) - 1, LONG_LENGTH * 10, dtype=torch.long)
73
- return export_to_video(video[indices], fps=10)
74
 
75
 
76
  def prepare_short_gt_video(idx: int):
@@ -104,7 +105,7 @@ def single_image_to_long_video(
104
  xs = video[indices].unsqueeze(0).to("cuda")
105
  conditions = poses[indices].unsqueeze(0).to("cuda")
106
  dfot.cfg.tasks.prediction.history_guidance.guidance_scale = guidance_scale
107
- dfot.cfg.tasks.prediction.keyframe_density = 0.6 / fps
108
  # dfot.cfg.tasks.interpolation.history_guidance.guidance_scale = guidance_scale
109
  gen_video = dfot._unnormalize_x(
110
  dfot._predict_videos(
@@ -151,6 +152,228 @@ def any_images_to_short_video(
151
  return video_to_gif_and_images([image for image in gen_video], list(range(8)))
152
 
153
 
154
  # Create the Gradio Blocks
155
  with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
156
  gr.HTML(
@@ -160,6 +383,21 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
160
  font-size: 16px !important;
161
  font-weight: bold;
162
  }
163
  </style>
164
  """
165
  )
@@ -169,14 +407,29 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
169
  "### Official Interactive Demo for [_History-guided Video Diffusion_](todo)"
170
  )
171
  with gr.Row():
172
- gr.Button(value="🌐 Website", link="https://boyuan.space/history-guidance")
173
- gr.Button(value="📄 Paper", link="https://arxiv.org/abs/2502.06764")
174
  gr.Button(
175
- value="💻 Code",
176
  link="https://github.com/kwsong0113/diffusion-forcing-transformer",
177
  )
178
  gr.Button(
179
- value="🤗 Pretrained Models", link="https://huggingface.co/kiwhansong/DFoT"
180
  )
181
 
182
  with gr.Accordion("Troubleshooting: Not Working or Too Slow?", open=False):
@@ -187,7 +440,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
187
  """
188
  )
189
 
190
-
191
  with gr.Tab("Any # of Images → Short Video", id="task-1"):
192
  gr.Markdown(
193
  """
@@ -225,7 +477,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
225
  def update_selection(selection: gr.SelectData):
226
  return selection.index
227
 
228
- demo1_scene_select_button = gr.Button("Select Scene")
229
 
230
  @demo1_scene_select_button.click(
231
  inputs=demo1_selected_scene_index, outputs=demo1_stage
@@ -257,7 +509,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
257
  choices=[(f"t={i}", i) for i in range(8)],
258
  value=[],
259
  )
260
- demo1_image_select_button = gr.Button("Select Input Images")
261
 
262
  @demo1_image_select_button.click(
263
  inputs=[demo1_selector],
@@ -304,7 +556,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
304
  info="Without history guidance: 1.0; Recommended: 4.0",
305
  interactive=True,
306
  )
307
- gr.Button("Generate Video").click(
308
  fn=any_images_to_short_video,
309
  inputs=[
310
  demo1_selected_scene_index,
@@ -316,9 +568,9 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
316
 
317
  with gr.Tab("Single Image → Long Video", id="task-2"):
318
  gr.Markdown(
319
- """
320
- ## Demo 2: Single Image → Long 20-second Video
321
- > #### _Diffusion Forcing Transformer, with History Guidance, can generate long videos via sliding window rollouts and temporal super-resolution._
322
  """
323
  )
324
 
@@ -344,7 +596,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
344
  def update_selection(selection: gr.SelectData):
345
  return selection.index
346
 
347
- demo2_select_button = gr.Button("Select Input Image")
348
 
349
  @demo2_select_button.click(
350
  inputs=demo2_selected_index, outputs=demo2_stage
@@ -369,49 +621,297 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
369
  label="Ground Truth Video",
370
  width=256,
371
  height=256,
372
  )
373
  demo2_video = gr.Video(
374
- label="Generated Video", width=256, height=256
375
  )
376
 
377
- with gr.Sidebar():
378
- gr.Markdown("### Sampling Parameters")
379
 
380
- demo2_guidance_scale = gr.Slider(
381
- minimum=1,
382
- maximum=6,
383
- value=4,
384
- step=0.5,
385
- label="History Guidance Scale",
386
- info="Without history guidance: 1.0; Recommended: 4.0",
387
- interactive=True,
388
  )
389
- demo2_fps = gr.Slider(
390
  minimum=2,
391
  maximum=10,
392
- value=4,
393
  step=1,
394
- label="FPS",
395
- info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
396
  interactive=True,
397
  )
398
- gr.Button("Generate Video").click(
399
- fn=single_image_to_long_video,
400
- inputs=[
401
- demo2_selected_index,
402
- demo2_guidance_scale,
403
- demo2_fps,
404
  ],
405
- outputs=demo2_video,
406
  )
407
 
408
- with gr.Tab("Single Image → Extremely Long Video", id="task-3"):
409
- gr.Markdown(
410
- """
411
- ## Demo 3: Single Image → Extremely Long Video
412
- > #### _TODO._
413
- """
414
- )
415
 
416
  if __name__ == "__main__":
417
  demo.launch()
 
6
  import numpy as np
7
  import torch
8
  from torchvision.datasets.utils import download_and_extract_archive
9
+ from einops import repeat
10
  from omegaconf import OmegaConf
11
  from algorithms.dfot import DFoTVideoPose
12
+ from history_guidance import HistoryGuidance
13
  from utils.ckpt_utils import download_pretrained
 
14
  from datasets.video.utils.io import read_video
 
15
  from export import export_to_video, export_to_gif, export_images_to_gif
16
+ from camera_pose import extend_poses, CameraPose
17
+ from scipy.spatial.transform import Rotation, Slerp
18
 
19
  DATASET_URL = "https://huggingface.co/kiwhansong/DFoT/resolve/main/datasets/RealEstate10K_Tiny.tar.gz"
20
  DATASET_DIR = Path("data/real-estate-10k-tiny")
21
+ LONG_LENGTH = 10 # seconds
22
+ NAVIGATION_FPS = 3
23
 
24
  if not DATASET_DIR.exists():
25
  DATASET_DIR.mkdir(parents=True)
 
70
 
71
  def prepare_long_gt_video(idx: int):
72
  video = video_list[idx]
73
+ indices = torch.linspace(0, video.size(0) - 1, 200, dtype=torch.long)
74
+ return export_to_video(video[indices], fps=200 // LONG_LENGTH)
75
 
76
 
77
  def prepare_short_gt_video(idx: int):
 
105
  xs = video[indices].unsqueeze(0).to("cuda")
106
  conditions = poses[indices].unsqueeze(0).to("cuda")
107
  dfot.cfg.tasks.prediction.history_guidance.guidance_scale = guidance_scale
108
+ dfot.cfg.tasks.prediction.keyframe_density = 12 / (fps * LONG_LENGTH)
109
  # dfot.cfg.tasks.interpolation.history_guidance.guidance_scale = guidance_scale
110
  gen_video = dfot._unnormalize_x(
111
  dfot._predict_videos(
 
152
  return video_to_gif_and_images([image for image in gen_video], list(range(8)))
153
 
154
 
155
+ class CustomProgressBar:
156
+ def __init__(self, pbar):
157
+ self.pbar = pbar
158
+
159
+ def set_postfix(self, **kwargs):
160
+ pass
161
+
162
+ def __getattr__(self, attr):
163
+ return getattr(self.pbar, attr)
164
+
165
+
166
+ @torch.autocast("cuda")
167
+ @torch.no_grad()
168
+ def navigate_video(
169
+ video: torch.Tensor,
170
+ poses: torch.Tensor,
171
+ x_angle: float,
172
+ y_angle: float,
173
+ distance: float,
174
+ ):
175
+ n_context_frames = min(len(video), 4)
176
+ n_prediction_frames = 8 - n_context_frames
177
+ pbar = CustomProgressBar(
178
+ gr.Progress(track_tqdm=True).tqdm(
179
+ iterable=None,
180
+ desc=f"Predicting next {n_prediction_frames} frames",
181
+ total=dfot.sampling_timesteps,
182
+ )
183
+ )
184
+ xs = dfot._normalize_x(video.clone().unsqueeze(0).to("cuda"))
185
+ conditions = poses.clone().unsqueeze(0).to("cuda")
186
+ conditions = extend_poses(
187
+ conditions,
188
+ n=n_prediction_frames,
189
+ x_angle=x_angle,
190
+ y_angle=y_angle,
191
+ distance=distance,
192
+ )
193
+ context_mask = (
194
+ torch.cat(
195
+ [
196
+ torch.ones(1, n_context_frames) * (1 if n_context_frames == 1 else 2),
197
+ torch.zeros(1, n_prediction_frames),
198
+ ],
199
+ dim=-1,
200
+ )
201
+ .long()
202
+ .to("cuda")
203
+ )
204
+ next_video = (
205
+ dfot._unnormalize_x(
206
+ dfot._sample_sequence(
207
+ batch_size=1,
208
+ context=torch.cat(
209
+ [
210
+ xs[:, -n_context_frames:],
211
+ torch.zeros(
212
+ 1,
213
+ n_prediction_frames,
214
+ *xs.shape[2:],
215
+ device=xs.device,
216
+ dtype=xs.dtype,
217
+ ),
218
+ ],
219
+ dim=1,
220
+ ),
221
+ context_mask=context_mask,
222
+ conditions=conditions[:, -8:],
223
+ history_guidance=HistoryGuidance.smart(
224
+ x_angle=x_angle,
225
+ y_angle=y_angle,
226
+ distance=distance,
227
+ visualize=False,
228
+ ),
229
+ pbar=pbar,
230
+ )[0]
231
+ )[0][n_context_frames:]
232
+ .detach()
233
+ .cpu()
234
+ )
235
+ gen_video = torch.cat([video, next_video], dim=0)
236
+ poses = conditions[0]
237
+
238
+ images = (gen_video.permute(0, 2, 3, 1) * 255).clamp(0, 255).to(torch.uint8).numpy()
239
+
240
+ return (
241
+ gen_video,
242
+ poses,
243
+ images[-1],
244
+ export_to_video(gen_video, fps=NAVIGATION_FPS),
245
+ [(image, f"t={i}") for i, image in enumerate(images)],
246
+ )
247
+
248
+ def undo_navigation(
249
+ video: torch.Tensor,
250
+ poses: torch.Tensor,
251
+ ):
252
+ if len(video) >= 8:
253
+ video = video[:-4]
254
+ poses = poses[:-4]
255
+ else:
256
+ gr.Warning("You have no moves left to undo!")
257
+ images = (video.permute(0, 2, 3, 1) * 255).clamp(0, 255).to(torch.uint8).numpy()
258
+ return (
259
+ video,
260
+ poses,
261
+ images[-1],
262
+ export_to_video(video, fps=NAVIGATION_FPS),
263
+ [(image, f"t={i}") for i, image in enumerate(images)],
264
+ )
265
+
266
+ def _interpolate_conditions(conditions, indices):
267
+ """
268
+ Interpolate conditions to fill out missing frames
269
+
270
+ Args:
271
+ conditions (Tensor): conditions (B, T, C)
272
+ indices (Tensor): indices of keyframes (T')
273
+ """
274
+ assert indices[0].item() == 0
275
+ assert indices[-1].item() == conditions.shape[1] - 1
276
+
277
+ indices = indices.cpu().numpy()
278
+ batch_size, n_tokens, _ = conditions.shape
279
+ t = np.linspace(0, n_tokens - 1, n_tokens)
280
+
281
+ key_conditions = conditions[:, indices]
282
+ poses = CameraPose.from_vectors(key_conditions)
283
+ extrinsics = poses.extrinsics().cpu().numpy()
284
+ ps = extrinsics[..., :3, 3]
285
+ rs = extrinsics[..., :3, :3].reshape(batch_size, -1, 3, 3)
286
+
287
+ interp_extrinsics = np.zeros((batch_size, n_tokens, 3, 4))
288
+ for i in range(batch_size):
289
+ slerp = Slerp(indices, Rotation.from_matrix(rs[i]))
290
+ interp_extrinsics[i, :, :3, :3] = slerp(t).as_matrix()
291
+ for j in range(3):
292
+ interp_extrinsics[i, :, j, 3] = np.interp(t, indices, ps[i, :, j])
293
+ interp_extrinsics = torch.from_numpy(interp_extrinsics.astype(np.float32))
294
+ interp_extrinsics = interp_extrinsics.to(conditions.device).flatten(2)
295
+ conditions = repeat(key_conditions[:, 0, :4], "b c -> b t c", t=n_tokens)
296
+ conditions = torch.cat([conditions.clone(), interp_extrinsics], dim=-1)
297
+
298
+ return conditions
299
+
300
+ @spaces.GPU(duration=300)
301
+ @torch.autocast("cuda")
302
+ @torch.no_grad()
303
+ def _interpolate_between(
304
+ xs: torch.Tensor,
305
+ conditions: torch.Tensor,
306
+ interpolation_factor: int,
307
+ progress=gr.Progress(track_tqdm=True),
308
+ ):
309
+ l = xs.shape[1]
310
+ final_l = (l - 1) * interpolation_factor + 1
311
+ x_shape = xs.shape[2:]
312
+ context = torch.zeros(
313
+ (
314
+ 1,
315
+ final_l,
316
+ *x_shape,
317
+ ),
318
+ device=xs.device,
319
+ dtype=xs.dtype,
320
+ )
321
+ long_conditions = torch.zeros(
322
+ (1, final_l, *conditions.shape[2:]),
323
+ device=conditions.device,
324
+ dtype=conditions.dtype,
325
+ )
326
+ context_mask = torch.zeros(
327
+ (1, final_l),
328
+ device=xs.device,
329
+ dtype=torch.bool,
330
+ )
331
+ context_indices = torch.arange(
332
+ 0, final_l, interpolation_factor, device=conditions.device
333
+ )
334
+ context[:, context_indices] = xs
335
+ long_conditions[:, context_indices] = conditions
336
+ context_mask[:, ::interpolation_factor] = True
337
+ long_conditions = _interpolate_conditions(
338
+ long_conditions,
339
+ context_indices,
340
+ )
341
+
342
+ xs = dfot._interpolate_videos(
343
+ context,
344
+ context_mask,
345
+ conditions=long_conditions,
346
+ )
347
+ return xs, long_conditions
348
+
349
+ def smooth_navigation(
350
+ video: torch.Tensor,
351
+ poses: torch.Tensor,
352
+ interpolation_factor: int,
353
+ progress=gr.Progress(track_tqdm=True),
354
+ ):
355
+ if len(video) < 8:
356
+ gr.Warning("Navigate first before applying temporal super-resolution!")
357
+ else:
358
+ video, poses = _interpolate_between(
359
+ dfot._normalize_x(video.clone().unsqueeze(0).to("cuda")),
360
+ poses.clone().unsqueeze(0).to("cuda"),
361
+ interpolation_factor,
362
+ )
363
+ video = dfot._unnormalize_x(video)[0].detach().cpu()
364
+ poses = poses[0]
365
+ images = (video.permute(0, 2, 3, 1) * 255).clamp(0, 255).to(torch.uint8).numpy()
366
+ return (
367
+ video,
368
+ poses,
369
+ images[-1],
370
+ export_to_video(video, fps=NAVIGATION_FPS * interpolation_factor),
371
+ [(image, f"t={i}") for i, image in enumerate(images)],
372
+ )
373
+
374
+
375
+
376
+
377
  # Create the Gradio Blocks
378
  with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
379
  gr.HTML(
 
383
  font-size: 16px !important;
384
  font-weight: bold;
385
  }
386
+ #header-button .button-icon {
387
+ margin-right: 8px;
388
+ }
389
+ #basic-controls {
390
+ column-gap: 0px;
391
+ }
392
+ #basic-controls button {
393
+ border: 1px solid #e4e4e7;
394
+ }
395
+ #basic-controls-tab {
396
+ padding: 0px;
397
+ }
398
+ #advanced-controls-tab {
399
+ padding: 0px;
400
+ }
401
  </style>
402
  """
403
  )
 
407
  "### Official Interactive Demo for [_History-guided Video Diffusion_](todo)"
408
  )
409
  with gr.Row():
410
  gr.Button(
411
+ value="Website",
412
+ link="https://boyuan.space/history-guidance",
413
+ icon="https://simpleicons.org/icons/googlechrome.svg",
414
+ elem_id="header-button",
415
+ )
416
+ gr.Button(
417
+ value="Paper",
418
+ link="https://arxiv.org/abs/2502.06764",
419
+ icon="https://simpleicons.org/icons/arxiv.svg",
420
+ elem_id="header-button",
421
+ )
422
+ gr.Button(
423
+ value="Code",
424
  link="https://github.com/kwsong0113/diffusion-forcing-transformer",
425
+ icon="https://simpleicons.org/icons/github.svg",
426
+ elem_id="header-button",
427
  )
428
  gr.Button(
429
+ value="Pretrained Models",
430
+ link="https://huggingface.co/kiwhansong/DFoT",
431
+ icon="https://simpleicons.org/icons/huggingface.svg",
432
+ elem_id="header-button",
433
  )
434
 
435
  with gr.Accordion("Troubleshooting: Not Working or Too Slow?", open=False):
 
440
  """
441
  )
442
 
 
443
  with gr.Tab("Any # of Images → Short Video", id="task-1"):
444
  gr.Markdown(
445
  """
 
477
  def update_selection(selection: gr.SelectData):
478
  return selection.index
479
 
480
+ demo1_scene_select_button = gr.Button("Select Scene", variant="primary")
481
 
482
  @demo1_scene_select_button.click(
483
  inputs=demo1_selected_scene_index, outputs=demo1_stage
 
509
  choices=[(f"t={i}", i) for i in range(8)],
510
  value=[],
511
  )
512
+ demo1_image_select_button = gr.Button("Select Input Images", variant="primary")
513
 
514
  @demo1_image_select_button.click(
515
  inputs=[demo1_selector],
 
556
  info="Without history guidance: 1.0; Recommended: 4.0",
557
  interactive=True,
558
  )
559
+ gr.Button("Generate Video", variant="primary").click(
560
  fn=any_images_to_short_video,
561
  inputs=[
562
  demo1_selected_scene_index,
 
568
 
569
  with gr.Tab("Single Image → Long Video", id="task-2"):
570
  gr.Markdown(
571
+ f"""
572
+ ## Demo 2: Single Image → Long {LONG_LENGTH}-second Video
573
+ > #### _Diffusion Forcing Transformer, with History Guidance, generates long videos via sliding window rollouts and temporal super-resolution._
574
  """
575
  )
576
 
 
596
  def update_selection(selection: gr.SelectData):
597
  return selection.index
598
 
599
+ demo2_select_button = gr.Button("Select Input Image", variant="primary")
600
 
601
  @demo2_select_button.click(
602
  inputs=demo2_selected_index, outputs=demo2_stage
 
621
  label="Ground Truth Video",
622
  width=256,
623
  height=256,
624
+ autoplay=True,
625
+ loop=True,
626
  )
627
  demo2_video = gr.Video(
628
+ label="Generated Video",
629
+ width=256,
630
+ height=256,
631
+ autoplay=True,
632
+ loop=True,
633
+ show_share_button=True,
634
+ show_download_button=True,
635
  )
636
 
637
+ with gr.Sidebar():
638
+ gr.Markdown("### Sampling Parameters")
639
 
640
+ demo2_guidance_scale = gr.Slider(
641
+ minimum=1,
642
+ maximum=6,
643
+ value=4,
644
+ step=0.5,
645
+ label="History Guidance Scale",
646
+ info="Without history guidance: 1.0; Recommended: 4.0",
647
+ interactive=True,
648
+ )
649
+ demo2_fps = gr.Slider(
650
+ minimum=4,
651
+ maximum=20,
652
+ value=8,
653
+ step=1,
654
+ label="FPS",
655
+ info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
656
+ interactive=True,
657
+ )
658
+ gr.Button("Generate Video", variant="primary").click(
659
+ fn=single_image_to_long_video,
660
+ inputs=[
661
+ demo2_selected_index,
662
+ demo2_guidance_scale,
663
+ demo2_fps,
664
+ ],
665
+ outputs=demo2_video,
666
+ )
667
+
668
+ with gr.Tab("Single Image → Endless Video Navigation", id="task-3"):
669
+ gr.Markdown(
670
+ """
671
+ ## Demo 3: Single Image → Extremely Long Video _(Navigate with Your Camera Movements!)_
672
+ > #### _History Guidance significantly improves quality and temporal consistency, enabling stable rollouts for extremely long videos._
673
+ """
674
+ )
675
+
676
+ demo3_stage = gr.State(value="Selection")
677
+ demo3_selected_index = gr.State(value=None)
678
+ demo3_current_video = gr.State(value=None)
679
+ demo3_current_poses = gr.State(value=None)
680
+
681
+ @gr.render(inputs=[demo3_stage, demo3_selected_index])
682
+ def render_stage(s, idx):
683
+ match s:
684
+ case "Selection":
685
+ with gr.Group():
686
+ demo3_image_gallery = gr.Gallery(
687
+ height=300,
688
+ value=first_frame_list,
689
+ label="Select an Image to Start Navigation",
690
+ columns=[8],
691
+ selected_index=idx,
692
+ )
693
+
694
+ @demo3_image_gallery.select(
695
+ inputs=None, outputs=demo3_selected_index
696
+ )
697
+ def update_selection(selection: gr.SelectData):
698
+ return selection.index
699
+
700
+ demo3_select_button = gr.Button("Select Input Image", variant="primary")
701
+
702
+ @demo3_select_button.click(
703
+ inputs=demo3_selected_index,
704
+ outputs=[
705
+ demo3_stage,
706
+ demo3_current_video,
707
+ demo3_current_poses,
708
+ ],
709
+ )
710
+ def move_to_generation(idx: int):
711
+ if idx is None:
712
+ gr.Warning("Image not selected!")
713
+ return "Selection", None, None
714
+ else:
715
+ return (
716
+ "Generation",
717
+ video_list[idx][:1],
718
+ poses_list[idx][:1],
719
+ )
720
+
721
+ case "Generation":
722
+ with gr.Row():
723
+ demo3_current_view = gr.Image(
724
+ value=first_frame_list[idx],
725
+ label="Current View",
726
+ width=256,
727
+ height=256,
728
+ )
729
+ demo3_video = gr.Video(
730
+ label="Generated Video",
731
+ width=256,
732
+ height=256,
733
+ autoplay=True,
734
+ loop=True,
735
+ show_share_button=True,
736
+ show_download_button=True,
737
+ )
738
+
739
+ demo3_generated_gallery = gr.Gallery(
740
+ value=[],
741
+ label="Generated Frames",
742
+ columns=[8],
743
+ )
744
+
745
+ with gr.Sidebar():
746
+ gr.Markdown(
747
+ """
748
+ ### Let's Navigate!
749
+ **The model will predict the next few frames based on your camera movements. Repeat the process to navigate through the scene.** The most suitable history guidance scheme is selected automatically for each move.
750
+ """
751
+ )
752
+ with gr.Tab("Basic", elem_id="basic-controls-tab"):
753
+ with gr.Group():
754
+ gr.Markdown("_**Select a direction to move:**_")
755
+ with gr.Row(elem_id="basic-controls"):
756
+ gr.Button("↰-60°\nTurn", size="sm", min_width=0, variant="primary").click(
757
+ fn=partial(
758
+ navigate_video,
759
+ x_angle=0,
760
+ y_angle=-60,
761
+ distance=0,
762
+ ),
763
+ inputs=[demo3_current_video, demo3_current_poses],
764
+ outputs=[
765
+ demo3_current_video,
766
+ demo3_current_poses,
767
+ demo3_current_view,
768
+ demo3_video,
769
+ demo3_generated_gallery,
770
+ ],
771
+ )
772
+
773
+ gr.Button("↖-30°\nVeer", size="sm", min_width=0, variant="primary").click(
774
+ fn=partial(
775
+ navigate_video,
776
+ x_angle=0,
777
+ y_angle=-30,
778
+ distance=50,
779
+ ),
780
+ inputs=[demo3_current_video, demo3_current_poses],
781
+ outputs=[
782
+ demo3_current_video,
783
+ demo3_current_poses,
784
+ demo3_current_view,
785
+ demo3_video,
786
+ demo3_generated_gallery,
787
+ ],
788
+ )
789
+
790
+ gr.Button("↑0°\nAhead", size="sm", min_width=0, variant="primary").click(
791
+ fn=partial(
792
+ navigate_video,
793
+ x_angle=0,
794
+ y_angle=0,
795
+ distance=100,
796
+ ),
797
+ inputs=[demo3_current_video, demo3_current_poses],
798
+ outputs=[
799
+ demo3_current_video,
800
+ demo3_current_poses,
801
+ demo3_current_view,
802
+ demo3_video,
803
+ demo3_generated_gallery,
804
+ ],
805
+ )
806
+ gr.Button("↗30°\nVeer", size="sm", min_width=0, variant="primary").click(
807
+ fn=partial(
808
+ navigate_video,
809
+ x_angle=0,
810
+ y_angle=30,
811
+ distance=50,
812
+ ),
813
+ inputs=[demo3_current_video, demo3_current_poses],
814
+ outputs=[
815
+ demo3_current_video,
816
+ demo3_current_poses,
817
+ demo3_current_view,
818
+ demo3_video,
819
+ demo3_generated_gallery,
820
+ ],
821
+ )
822
+ gr.Button("↱\n60° Turn", size="sm", min_width=0, variant="primary").click(
823
+ fn=partial(
824
+ navigate_video,
825
+ x_angle=0,
826
+ y_angle=60,
827
+ distance=0,
828
+ ),
829
+ inputs=[demo3_current_video, demo3_current_poses],
830
+ outputs=[
831
+ demo3_current_video,
832
+ demo3_current_poses,
833
+ demo3_current_view,
834
+ demo3_video,
835
+ demo3_generated_gallery,
836
+ ],
837
+ )
838
+ with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
839
+ with gr.Group():
840
+ gr.Markdown("_**Select angles and distance:**_")
841
+
842
+ demo3_y_angle = gr.Slider(
843
+ minimum=-90,
844
+ maximum=90,
845
+ value=0,
846
+ step=10,
847
+ label="Horizontal Angle",
848
+ interactive=True,
849
+ )
850
+ demo3_x_angle = gr.Slider(
851
+ minimum=-40,
852
+ maximum=40,
853
+ value=0,
854
+ step=10,
855
+ label="Vertical Angle",
856
+ interactive=True,
857
+ )
858
+ demo3_distance = gr.Slider(
859
+ minimum=0,
860
+ maximum=200,
861
+ value=100,
862
+ step=10,
863
+ label="Distance",
864
+ interactive=True,
865
+ )
866
+
867
+ gr.Button("Generate Next Move", variant="primary").click(
868
+ fn=partial(
869
+ navigate_video,
870
+ ),
871
+ inputs=[demo3_current_video, demo3_current_poses, demo3_x_angle, demo3_y_angle, demo3_distance],
872
+ outputs=[
873
+ demo3_current_video,
874
+ demo3_current_poses,
875
+ demo3_current_view,
876
+ demo3_video,
877
+ demo3_generated_gallery,
878
+ ],
879
+ )
880
+ with gr.Group():
881
+ gr.Markdown("_You can always undo your last move:_")
882
+ gr.Button("Undo Last Move", variant="huggingface").click(
883
+ fn=undo_navigation,
884
+ inputs=[demo3_current_video, demo3_current_poses],
885
+ outputs=[
886
+ demo3_current_video,
887
+ demo3_current_poses,
888
+ demo3_current_view,
889
+ demo3_video,
890
+ demo3_generated_gallery,
891
+ ],
892
  )
893
+ with gr.Group():
894
+ gr.Markdown("_At the end, apply temporal super-resolution to obtain a smoother video:_")
895
+ demo3_interpolation_factor=gr.Slider(
896
  minimum=2,
897
  maximum=10,
898
+ value=2,
899
  step=1,
900
+ label="Interpolation Factor",
 
901
  interactive=True,
902
  )
903
+ gr.Button("Smooth Out Video", variant="huggingface").click(
904
+ fn=smooth_navigation,
905
+ inputs=[demo3_current_video, demo3_current_poses, demo3_interpolation_factor],
906
+ outputs=[
907
+ demo3_current_video,
908
+ demo3_current_poses,
909
+ demo3_current_view,
910
+ demo3_video,
911
+ demo3_generated_gallery,
912
  ],
 
913
  )
914
 
915
 
916
  if __name__ == "__main__":
917
  demo.launch()
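
Note on the changed keyframe_density line: with LONG_LENGTH = 10 and the new FPS slider (4-20, default 8), setting keyframe_density = 12 / (fps * LONG_LENGTH) keeps the keyframe budget of the sliding-window prediction stage at roughly 12 frames, with temporal super-resolution filling in the rest. A minimal sketch of that arithmetic, assuming keyframe_density is interpreted as the fraction of output frames sampled as keyframes (the exact semantics live inside DFoTVideoPose):

    # Sketch of the keyframe budget implied by `keyframe_density = 12 / (fps * LONG_LENGTH)`.
    LONG_LENGTH = 10  # seconds, as defined at the top of app.py

    def keyframe_budget(fps: int) -> float:
        total_frames = fps * LONG_LENGTH        # frames in the generated long video
        density = 12 / (fps * LONG_LENGTH)      # value written into dfot.cfg
        return density * total_frames           # expected number of keyframes

    for fps in (4, 8, 20):
        print(fps, keyframe_budget(fps))        # always 12.0, independent of FPS
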
camera_pose.py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+ from utils.geometry_utils import CameraPose
3
+ from einops import rearrange, repeat
4
+ import math
5
+ import roma
6
+
7
+ class ControllableCameraPose(CameraPose):
8
+ def to_vectors(self) -> torch.Tensor:
9
+ """
10
+ Returns the raw camera poses.
11
+ Returns:
12
+ torch.Tensor: The raw camera poses. Shape (B, T, 4 + 12).
13
+ """
14
+ RT = torch.cat([self._R, rearrange(self._T, "b t i -> b t i 1")], dim=-1)
15
+ return torch.cat([self._K, rearrange(RT, "b t i j -> b t (i j)")], dim=-1)
16
+
17
+ def extend(
18
+ self,
19
+ num_frames: int,
20
+ x_angle: float = 0.0,
21
+ y_angle: float = 0.0,
22
+ distance: float = 100.0,
23
+ ) -> None:
24
+ """
25
+ Extends the camera poses.
26
+ 0 degrees is taken to be the viewing direction of the last camera pose.
27
+ Smoothly moves and rotates the camera poses in the direction of the given angles (clockwise) in a 2D plane.
28
+ Args:
29
+ num_frames (int): The number of frames to extend.
30
+ x_angle (float): The vertical rotation angle, in degrees.
31
+ y_angle (float): The horizontal rotation angle, in degrees.
32
+ """
33
+ MOVING_SCALE = 0.5 * distance / 100
34
+ self._normalize_by(self._R[:, -1], self._T[:, -1])
35
+
36
+ # first compute relative poses for the final n + num_frames th frame
37
+
38
+ # compute the rotation matrix for the given angle
39
+ R_final = roma.euler_to_rotmat(
40
+ convention="xyz",
41
+ angles=torch.tensor(
42
+ [-x_angle, -y_angle, 0], device=self._R.device, dtype=torch.float32
43
+ ),
44
+ degrees=True,
45
+ dtype=torch.float32,
46
+ device=self._R.device,
47
+ ).unsqueeze(0)
48
+
49
+ # compute the translation vector for the given angle
50
+ T_final = torch.tensor(
51
+ [
52
+ -MOVING_SCALE * num_frames * math.sin(math.radians(y_angle)),
53
+ MOVING_SCALE * num_frames * math.sin(math.radians(x_angle)),
54
+ -MOVING_SCALE * num_frames * math.cos(math.radians(y_angle)),
55
+ ],
56
+ device=self._T.device,
57
+ dtype=self._T.dtype,
58
+ ).unsqueeze(0)
59
+
60
+ R = torch.cat(
61
+ [self._R, repeat(R_final, "b i j -> b t i j", t=num_frames).clone()], dim=1
62
+ )
63
+ T = torch.cat(
64
+ [self._T, repeat(T_final, "b i -> b t i", t=num_frames).clone()], dim=1
65
+ )
66
+ K = torch.cat(
67
+ [self._K, repeat(self._K[:, -1], "b i -> b t i", t=num_frames).clone()],
68
+ dim=1,
69
+ )
70
+ self._R = R
71
+ self._T = T
72
+ self._K = K
73
+ # interpolate all frames between the last frame and the final frame
74
+ self.replace_with_interpolation(
75
+ torch.cat(
76
+ [
77
+ torch.zeros_like(self._T[:, :-num_frames, 0]),
78
+ torch.ones_like(self._T[:, -num_frames:-1, 0]),
79
+ torch.zeros_like(self._T[:, -1:, 0]),
80
+ ],
81
+ dim=-1,
82
+ ).bool()
83
+ )
84
+
85
+ def extend_poses(
86
+ conditions: torch.Tensor,
87
+ n: int,
88
+ x_angle: float = 0.0,
89
+ y_angle: float = 0.0,
90
+ distance: float = 0.0,
91
+ ) -> torch.Tensor:
92
+ poses = ControllableCameraPose.from_vectors(conditions)
93
+ poses.extend(n, x_angle, y_angle, distance)
94
+ return poses.to_vectors()
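
Note: a hypothetical usage sketch of extend_poses, mirroring how navigate_video calls it in app.py. The pose-vector layout (4 intrinsics followed by a flattened 3x4 extrinsic, 16 values per frame) follows to_vectors() above; the intrinsic values and identity extrinsics below are placeholders, not real dataset poses.

    import torch
    from camera_pose import extend_poses

    # One camera pose: [fx, fy, cx, cy] intrinsics (placeholder values)
    # followed by a flattened [I | 0] extrinsic, repeated for 4 context frames.
    K = torch.tensor([1.0, 1.0, 0.5, 0.5])
    RT = torch.eye(3, 4).flatten()
    poses = torch.cat([K, RT]).repeat(1, 4, 1)   # shape (1, 4, 16)

    # Extend by 4 frames: turn 30 degrees to the right while moving forward,
    # within the ranges exposed by the demo's sliders (distance 0-200).
    extended = extend_poses(poses, n=4, x_angle=0.0, y_angle=30.0, distance=50.0)
    print(extended.shape)                        # (1, 8, 16)
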
history_guidance.py ADDED
@@ -0,0 +1,24 @@
1
+ from algorithms.dfot.history_guidance import HistoryGuidance as _HistoryGuidance
2
+
3
+ class HistoryGuidance(_HistoryGuidance):
4
+ @classmethod
5
+ def smart(
6
+ cls,
7
+ x_angle: float,
8
+ y_angle: float,
9
+ distance: float,
10
+ visualize: bool = False,
11
+ ):
12
+ if abs(x_angle) < 30 and abs(y_angle) < 30 and distance < 150:
13
+ return cls.stabilized_fractional(
14
+ guidance_scale=4.0,
15
+ stabilization_level=0.02,
16
+ freq_scale=0.4,
17
+ visualize=visualize,
18
+ )
19
+ else:
20
+ return cls.stabilized_vanilla(
21
+ guidance_scale=4.0,
22
+ stabilization_level=0.02,
23
+ visualize=visualize,
24
+ )
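
Note: smart() simply picks between two existing guidance schemes based on the magnitude of the requested camera move, using the thresholds hard-coded above. A hypothetical usage sketch, mirroring the calls navigate_video makes for the demo's "Ahead" and "60° Turn" buttons:

    from history_guidance import HistoryGuidance

    # Small move (straight ahead, moderate distance) -> stabilized fractional guidance
    hg_ahead = HistoryGuidance.smart(x_angle=0, y_angle=0, distance=100)

    # Aggressive move (60-degree turn in place) -> stabilized vanilla guidance
    hg_turn = HistoryGuidance.smart(x_angle=0, y_angle=60, distance=0)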