alexnasa committed on
Commit 8e16429 · verified · 1 Parent(s): 1ee8ca8

Upload 52 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +4 -0
  2. LICENSE +21 -0
  3. config/a100l.yaml +13 -0
  4. config/multi_gpu.yaml +17 -0
  5. etc/architecture_figure.png +3 -0
  6. etc/genvid_57_11_04453.gif +3 -0
  7. etc/genvid_64_48_08386.gif +3 -0
  8. etc/genvid_87_21_08924.gif +3 -0
  9. one_sample.ipynb +19 -0
  10. run_gen_videos.py +44 -0
  11. scripts/controlnet_train_action_multigpu.sh +58 -0
  12. scripts/controlnet_train_action_singlegpu.sh +58 -0
  13. scripts/mmau_train_video_diffusion_multigpu.sh +52 -0
  14. scripts/mmau_train_video_diffusion_singlegpu.sh +53 -0
  15. src/__init__.py +0 -0
  16. src/__pycache__/__init__.cpython-310.pyc +0 -0
  17. src/datasets/base_dataset.py +189 -0
  18. src/datasets/bbox_utils.py +68 -0
  19. src/datasets/bdd100k_dataset.py +185 -0
  20. src/datasets/dada2000_dataset.py +339 -0
  21. src/datasets/dataset_factory.py +36 -0
  22. src/datasets/dataset_utils.py +50 -0
  23. src/datasets/merged_dataset.py +54 -0
  24. src/datasets/mmau_dataset.py +549 -0
  25. src/datasets/nuscenes_dataset.py +298 -0
  26. src/datasets/russia_crash_dataset.py +173 -0
  27. src/eval/README.md +120 -0
  28. src/eval/__pycache__/generate_samples.cpython-310.pyc +0 -0
  29. src/eval/generate_samples.py +394 -0
  30. src/eval/video_dataset.py +79 -0
  31. src/eval/video_quality_metrics_fvd_gt_rand.py +458 -0
  32. src/eval/video_quality_metrics_fvd_pair.py +349 -0
  33. src/eval/video_quality_metrics_jedi_gt_rand.py +91 -0
  34. src/eval/video_quality_metrics_jedi_pair.py +92 -0
  35. src/models/__init__.py +2 -0
  36. src/models/controlnet.py +391 -0
  37. src/models/unet_spatio_temporal_condition.py +169 -0
  38. src/pipelines/__init__.py +4 -0
  39. src/pipelines/pipeline_video_control.py +408 -0
  40. src/pipelines/pipeline_video_control_factor_guidance.py +615 -0
  41. src/pipelines/pipeline_video_control_nullmodel.py +406 -0
  42. src/pipelines/pipeline_video_diffusion.py +305 -0
  43. src/preprocess/README.md +105 -0
  44. src/preprocess/filter_dataset_tool.py +315 -0
  45. src/preprocess/preprocess_cap_dataset.py +224 -0
  46. src/preprocess/preprocess_dada_dataset.py +222 -0
  47. src/preprocess/preprocess_russia_dataset.py +168 -0
  48. src/preprocess/yolo_sam.py +584 -0
  49. src/utils/__init__.py +2 -0
  50. src/utils/parser.py +472 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ etc/architecture_figure.png filter=lfs diff=lfs merge=lfs -text
+ etc/genvid_57_11_04453.gif filter=lfs diff=lfs merge=lfs -text
+ etc/genvid_64_48_08386.gif filter=lfs diff=lfs merge=lfs -text
+ etc/genvid_87_21_08924.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
config/a100l.yaml ADDED
@@ -0,0 +1,13 @@
+ compute_environment: LOCAL_MACHINE
+ deepspeed_config: {}
+ distributed_type: NO
+ fsdp_config: {}
+ machine_rank: 0
+ main_process_ip: null
+ main_process_port: null
+ main_training_function: main
+ mixed_precision: fp16
+ num_machines: 1
+ num_processes: 1
+ use_cpu: false
+ gpu_ids: all
config/multi_gpu.yaml ADDED
@@ -0,0 +1,17 @@
+ compute_environment: LOCAL_MACHINE
+ debug: true
+ distributed_type: MULTI_GPU
+ downcast_bf16: 'no'
+ enable_cpu_affinity: false
+ gpu_ids: all
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: fp16
+ num_machines: 1
+ num_processes: 4
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
etc/architecture_figure.png ADDED

Git LFS Details

  • SHA256: bc7fb8e3c488eaca947d612cb381338219d993330518db454ab18ae889e8513b
  • Pointer size: 131 Bytes
  • Size of remote file: 332 kB
etc/genvid_57_11_04453.gif ADDED

Git LFS Details

  • SHA256: 0af33f0855619ecea683552fdedb2f66df9ae6d953e9421e1e4dd03ffe8cde4a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.88 MB
etc/genvid_64_48_08386.gif ADDED

Git LFS Details

  • SHA256: 2df6a977385d2dd05e6c624ae0d13f05ddfb832c302e44b4de97d5d347611a94
  • Pointer size: 132 Bytes
  • Size of remote file: 1.5 MB
etc/genvid_87_21_08924.gif ADDED

Git LFS Details

  • SHA256: bfb4e57880ce7d208a401eb1d78537173b60b20882d59a8313fe7e96ab1c4861
  • Pointer size: 132 Bytes
  • Size of remote file: 1.39 MB
one_sample.ipynb ADDED
@@ -0,0 +1,19 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "dd192fe4",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "language_info": {
+    "name": "python"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
run_gen_videos.py ADDED
@@ -0,0 +1,44 @@
+ import argparse
+
+ from src.eval.generate_samples import generate_samples
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Generate test samples from MMAU dataset")
+     parser.add_argument('--model_path', type=str, required=True, help='Model checkpoint used for generation')
+     parser.add_argument('--data_root', type=str, required=True, help='Dataset root path')
+     parser.add_argument('--output_path', type=str, default="./output_videos", help='Video output path')
+     parser.add_argument('--disable_null_model', action="store_true", default=False, help='For uncond noise preds, whether to use a null model')
+     parser.add_argument('--use_factor_guidance', action="store_true", default=False, help='')
+     parser.add_argument('--num_demo_samples', type=int, default=10, help='Number of samples to collect for generation')
+     parser.add_argument('--max_output_vids', type=int, default=200, help='Exit program once this many videos have been generated')
+     parser.add_argument('--num_gens_per_sample', type=int, default=1, help='Number of videos to generate for each test case')
+     parser.add_argument('--eval_output', action="store_true", default=False, help='')
+     parser.add_argument('--seed', type=int, default=None, help='')
+     parser.add_argument('--dataset', type=str, default="mmau")
+     parser.add_argument(
+         "--bbox_mask_idx_batch",
+         nargs="+",
+         type=int,
+         default=[None],
+         choices=list(range(25+1)),
+         help="Where to start the masking, multiple values represent multiple different test cases for each sample",
+     )
+     parser.add_argument(
+         "--force_action_type_batch",
+         nargs="+",
+         type=int,
+         default=[None],
+         choices=[0, 1, 2, 3, 4],
+         help="Which action type to force, multiple values represent multiple different test cases for each sample",
+     )
+     parser.add_argument(
+         "--guidance_scales",
+         nargs="+",
+         type=int,
+         default=[(1, 9)],
+         help="Guidance progression to use, multiple values represent multiple different test cases for each sample",
+     )
+
+     args = parser.parse_args()
+
+     generate_samples(args)
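A minimal usage sketch (not part of the commit): since run_gen_videos.py simply forwards the parsed arguments to generate_samples, the same entry point can be driven programmatically with an argparse.Namespace. The checkpoint and dataset paths below are placeholders, and this assumes generate_samples only reads the attributes defined by the parser above.

    import argparse
    from src.eval.generate_samples import generate_samples

    args = argparse.Namespace(
        model_path="/path/to/checkpoint",   # placeholder
        data_root="/path/to/datasets",      # placeholder
        output_path="./output_videos",
        disable_null_model=False,
        use_factor_guidance=False,
        num_demo_samples=10,
        max_output_vids=200,
        num_gens_per_sample=1,
        eval_output=False,
        seed=1234,
        dataset="mmau",
        bbox_mask_idx_batch=[None],
        force_action_type_batch=[None],
        guidance_scales=[(1, 9)],
    )
    generate_samples(args)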
scripts/controlnet_train_action_multigpu.sh ADDED
@@ -0,0 +1,58 @@
+ # nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9
+
+ # User-specific paths and settings
+ DATASET_PATH="<path/to/datasets>" # e.g., "/home/datasets_root"
+ NAME="<experiment_name>" # e.g., "box2video_experiment1"
+ OUT_DIR="<path/to/output>/${NAME}" # e.g., "/home/results/${NAME}"
+ PROJECT_NAME='<wandb_project_name>' # e.g., 'car_crash'
+ WANDB_ENTITY='<wandb_username>' # Your Weights & Biases username
+ PRETRAINED_MODEL_PATH="<path/to/pretrained/model>" # e.g., "/home/checkpoints_root/checkpoint"
+
+ # export HF_HOME=/path/to/root # Where the SVD pretrained models are/will be downloaded
+
+ # Create output directory
+ mkdir -p $OUT_DIR
+
+ # Save training script for reference
+ SCRIPT_PATH=$0
+ SAVE_SCRIPT_PATH="${OUT_DIR}/train_scripts.sh"
+ cp $SCRIPT_PATH $SAVE_SCRIPT_PATH
+ echo "Saved script to ${SAVE_SCRIPT_PATH}"
+
+ # Training command
+ CUDA_LAUNCH_BLOCKING=1 accelerate launch --config_file config/multi_gpu.yaml train_video_controlnet.py \
+     --run_name $NAME \
+     --data_root $DATASET_PATH \
+     --project_name $PROJECT_NAME \
+     --pretrained_model_name_or_path $PRETRAINED_MODEL_PATH \
+     --output_dir $OUT_DIR \
+     --variant fp16 \
+     --dataset_name mmau \
+     --train_batch_size 1 \
+     --learning_rate 4e-5 \
+     --checkpoints_total_limit 3 \
+     --checkpointing_steps 300 \
+     --checkpointing_time 10620 \
+     --gradient_accumulation_steps 5 \
+     --validation_steps 300 \
+     --enable_gradient_checkpointing \
+     --lr_scheduler constant \
+     --report_to wandb \
+     --seed 1234 \
+     --mixed_precision fp16 \
+     --clip_length 25 \
+     --fps 6 \
+     --min_guidance_scale 1.0 \
+     --max_guidance_scale 3.0 \
+     --noise_aug_strength 0.01 \
+     --num_demo_samples 15 \
+     --num_train_epochs 10 \
+     --dataloader_num_workers 0 \
+     --resume_from_checkpoint latest \
+     --wandb_entity $WANDB_ENTITY \
+     --train_H 320 \
+     --train_W 512 \
+     --use_action_conditioning \
+     --contiguous_bbox_masking_prob 0.75 \
+     --contiguous_bbox_masking_start_ratio 0.0 \
+     --val_on_first_step
scripts/controlnet_train_action_singlegpu.sh ADDED
@@ -0,0 +1,58 @@
+ # nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9
+
+ # User-specific paths and settings
+ DATASET_PATH="<path/to/datasets>" # e.g., "/home/datasets_root"
+ NAME="<experiment_name>" # e.g., "box2video_experiment1"
+ OUT_DIR="<path/to/output>/${NAME}" # e.g., "/home/results/${NAME}"
+ PROJECT_NAME='<wandb_project_name>' # e.g., 'car_crash'
+ WANDB_ENTITY='<wandb_username>' # Your Weights & Biases username
+ PRETRAINED_MODEL_PATH="<path/to/pretrained/model>" # e.g., "/path/to/pretrained/checkpoint"
+
+ # export HF_HOME=/path/to/root # Where the SVD pretrained models are/will be downloaded
+
+ # Create output directory
+ mkdir -p $OUT_DIR
+
+ # Save training script for reference
+ SCRIPT_PATH=$0
+ SAVE_SCRIPT_PATH="${OUT_DIR}/train_scripts.sh"
+ cp $SCRIPT_PATH $SAVE_SCRIPT_PATH
+ echo "Saved script to ${SAVE_SCRIPT_PATH}"
+
+ # Training command
+ CUDA_LAUNCH_BLOCKING=1 accelerate launch --config_file config/a100l.yaml train_video_controlnet.py \
+     --run_name $NAME \
+     --data_root $DATASET_PATH \
+     --project_name $PROJECT_NAME \
+     --pretrained_model_name_or_path $PRETRAINED_MODEL_PATH \
+     --output_dir $OUT_DIR \
+     --variant fp16 \
+     --dataset_name mmau \
+     --train_batch_size 1 \
+     --learning_rate 4e-5 \
+     --checkpoints_total_limit 3 \
+     --checkpointing_steps 300 \
+     --checkpointing_time 10620 \
+     --gradient_accumulation_steps 5 \
+     --validation_steps 300 \
+     --enable_gradient_checkpointing \
+     --lr_scheduler constant \
+     --report_to wandb \
+     --seed 1234 \
+     --mixed_precision fp16 \
+     --clip_length 25 \
+     --fps 6 \
+     --min_guidance_scale 1.0 \
+     --max_guidance_scale 3.0 \
+     --noise_aug_strength 0.01 \
+     --num_demo_samples 15 \
+     --num_train_epochs 10 \
+     --dataloader_num_workers 0 \
+     --resume_from_checkpoint latest \
+     --wandb_entity $WANDB_ENTITY \
+     --train_H 320 \
+     --train_W 512 \
+     --use_action_conditioning \
+     --contiguous_bbox_masking_prob 0.75 \
+     --contiguous_bbox_masking_start_ratio 0.0 \
+     --val_on_first_step
scripts/mmau_train_video_diffusion_multigpu.sh ADDED
@@ -0,0 +1,52 @@
+ # nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9
+
+ # User-specific paths and settings
+ DATASET_PATH="<path/to/datasets>" # e.g., "/home/datasets_root"
+ NAME="<experiment_name>" # e.g., "box2video_experiment1"
+ OUT_DIR="<path/to/output>/${NAME}" # e.g., "/home/results/${NAME}"
+ PROJECT_NAME='<wandb_project_name>' # e.g., 'car_crash'
+ WANDB_ENTITY='<wandb_username>' # Your Weights & Biases username
+ PRETRAINED_MODEL_PATH="stabilityai/stable-video-diffusion-img2vid-xt" # HuggingFace model ID
+
+ # export HF_HOME=/path/to/root # Where the SVD pretrained models are/will be downloaded
+
+ # Create output directory
+ mkdir -p $OUT_DIR
+
+ # Save training script for reference
+ SCRIPT_PATH=$0
+ SAVE_SCRIPT_PATH="${OUT_DIR}/train_scripts.sh"
+ cp $SCRIPT_PATH $SAVE_SCRIPT_PATH
+ echo "Saved script to ${SAVE_SCRIPT_PATH}"
+
+ # Training command
+ CUDA_LAUNCH_BLOCKING=1 accelerate launch --config_file config/multi_gpu.yaml train_video_diffusion.py \
+     --run_name $NAME \
+     --data_root $DATASET_PATH \
+     --project_name $PROJECT_NAME \
+     --pretrained_model_name_or_path $PRETRAINED_MODEL_PATH \
+     --output_dir $OUT_DIR \
+     --variant fp16 \
+     --dataset_name mmau \
+     --train_batch_size 1 \
+     --learning_rate 1e-5 \
+     --checkpoints_total_limit 3 \
+     --checkpointing_steps 300 \
+     --gradient_accumulation_steps 5 \
+     --validation_steps 300 \
+     --enable_gradient_checkpointing \
+     --lr_scheduler constant \
+     --report_to wandb \
+     --seed 1234 \
+     --mixed_precision fp16 \
+     --clip_length 25 \
+     --min_guidance_scale 1.0 \
+     --max_guidance_scale 3.0 \
+     --noise_aug_strength 0.01 \
+     --num_demo_samples 15 \
+     --backprop_temporal_blocks_start_iter -1 \
+     --num_train_epochs 30 \
+     --train_H 320 \
+     --train_W 512 \
+     --resume_from_checkpoint latest \
+     --wandb_entity $WANDB_ENTITY
scripts/mmau_train_video_diffusion_singlegpu.sh ADDED
@@ -0,0 +1,53 @@
+ # nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9
+
+ # User-specific paths and settings
+ DATASET_PATH="<path/to/datasets>" # e.g., "/home/datasets_root"
+ NAME="<experiment_name>" # e.g., "box2video_experiment1"
+ OUT_DIR="<path/to/output>/${NAME}" # e.g., "/home/results/${NAME}"
+ PROJECT_NAME='<wandb_project_name>' # e.g., 'car_crash'
+ WANDB_ENTITY='<wandb_username>' # Your Weights & Biases username
+ PRETRAINED_MODEL_PATH="stabilityai/stable-video-diffusion-img2vid-xt" # HuggingFace model ID
+
+ # export HF_HOME=/path/to/root # Where the SVD pretrained models are/will be downloaded
+
+ # Create output directory
+ mkdir -p $OUT_DIR
+
+ # Save training script for reference
+ SCRIPT_PATH=$0
+ SAVE_SCRIPT_PATH="${OUT_DIR}/train_scripts.sh"
+ cp $SCRIPT_PATH $SAVE_SCRIPT_PATH
+ echo "Saved script to ${SAVE_SCRIPT_PATH}"
+
+ # Training command
+ CUDA_LAUNCH_BLOCKING=1 accelerate launch --config_file config/a100l.yaml train_video_diffusion.py \
+     --run_name $NAME \
+     --data_root $DATASET_PATH \
+     --project_name $PROJECT_NAME \
+     --pretrained_model_name_or_path $PRETRAINED_MODEL_PATH \
+     --output_dir $OUT_DIR \
+     --variant fp16 \
+     --dataset_name mmau \
+     --train_batch_size 1 \
+     --learning_rate 1e-5 \
+     --checkpoints_total_limit 3 \
+     --checkpointing_steps 300 \
+     --gradient_accumulation_steps 5 \
+     --validation_steps 300 \
+     --enable_gradient_checkpointing \
+     --lr_scheduler constant \
+     --report_to wandb \
+     --seed 1234 \
+     --mixed_precision fp16 \
+     --clip_length 25 \
+     --min_guidance_scale 1.0 \
+     --max_guidance_scale 3.0 \
+     --noise_aug_strength 0.01 \
+     --bbox_dropout_prob 0.1 \
+     --num_demo_samples 15 \
+     --backprop_temporal_blocks_start_iter -1 \
+     --num_train_epochs 30 \
+     --train_H 320 \
+     --train_W 512 \
+     --resume_from_checkpoint latest \
+     --wandb_entity $WANDB_ENTITY
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (153 Bytes).
src/datasets/base_dataset.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ import torch
+ from torchvision import transforms
+ from PIL import Image
+ import random
+
+ from src.datasets.bbox_utils import plot_2d_bbox
+
+
+ class BaseDataset:
+
+     def __init__(self,
+                  root='./datasets',
+                  train=True,
+                  clip_length=25,
+                  # orig_width=None, orig_height=None,
+                  resize_width=512, resize_height=320,
+                  non_overlapping_clips=False,
+                  bbox_masking_prob=0.0,
+                  sample_clip_from_end=True,
+                  ego_only=False,
+                  ignore_labels=False):
+
+         self.root = root
+         self.train = train
+         self.clip_length = clip_length
+         # self.orig_width = orig_width
+         # self.orig_height = orig_height
+         self.resize_width = resize_width
+         self.resize_height = resize_height
+
+         self.non_overlapping_clips = non_overlapping_clips
+         self.bbox_masking_prob = bbox_masking_prob
+         self.sample_clip_from_end = sample_clip_from_end
+         self.ego_only = ego_only
+         self.ignore_labels = ignore_labels
+
+         self.data_split = 'train' if self.train else 'val'
+
+         # Image transforms
+         self.transform = transforms.Compose([
+             transforms.Resize((self.resize_height, self.resize_width)),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),  # map from [0,1] to [-1,1]
+         ])
+         self.revert_transform = transforms.Compose([
+             transforms.Normalize(mean=(-1, -1, -1), std=(2, 2, 2)),
+         ])
+
+         self.image_files = []   # Contains the paths of all the images in the dataset
+         self.clip_list = []     # Contains a list of image indices for each clip
+         self.frame_labels = []  # For each image file, contains a list of dicts (labels of each object in the frame)
+
+         self.disable_cache = True
+         if self.disable_cache:
+             print("Bbox image caching disabled")
+
+     def __len__(self):
+         return len(self.clip_list)
+
+     def __getitem__(self, index):
+         return self._getclipitem(index)
+
+     def _getclipitem(self, index):
+         frames_indices = self.clip_list[index]
+
+         images, labels, bboxes, image_paths = [], [], [], []
+         masked_track_ids = self._get_masked_track_ids(frames_indices)
+         for frame_idx in frames_indices:
+             ret_dict = self._getimageitem(frame_idx, masked_track_ids=masked_track_ids)
+             images.append(ret_dict["image"])
+             labels.append(ret_dict["labels"])
+             bboxes.append(ret_dict["bbox_image"])
+             image_paths.append(ret_dict["image_path"])
+         images = torch.stack(images)
+         prompt = ""  # NOTE: Currently not supporting prompts
+
+         action_type = 0  # Assume "normal" driving when unspecified
+         if hasattr(self, "action_type_list"):
+             action_type = self.action_type_list[index]
+
+         vid_name = self.image_files[frames_indices[0]].split("/")[-1].split(".")[0][:-5]
+
+         if not self.ignore_labels:
+             bboxes = torch.stack(bboxes)
+
+             # NOTE: Keys are plural because this makes more sense when batches get collated
+             ret_dict = {"clips": images,
+                         "prompts": prompt,
+                         "indices": index,
+                         "bbox_images": bboxes,
+                         "action_type": action_type,
+                         "vid_name": vid_name,
+                         "image_paths": image_paths
+                         }
+         else:
+             ret_dict = {"clips": images,
+                         "prompts": prompt,
+                         "indices": index}
+
+         return ret_dict
+
+     def _getimageitem(self, frame_index, masked_track_ids=None):
+         # Get the image
+         image_file = self.image_files[frame_index]
+         image = Image.open(image_file)
+         image = self.transform(image)
+
+         if not self.ignore_labels:
+
+             # Get the labels
+             labels = self.frame_labels[frame_index]
+
+             # Get the bbox image (from cache or draw a new one)
+             image_filename = image_file.split('/')[-1].split('.')[0]
+             cache_filename = f"{image_filename}_bboxes"
+             cache_file = os.path.join(self.bbox_image_dir, f"{cache_filename}.jpg")
+             redraw_for_masked_agents = masked_track_ids is not None and len(masked_track_ids) > 0
+             if not os.path.exists(cache_file) or redraw_for_masked_agents or self.disable_cache:
+                 bbox_im = self._draw_bbox(labels, cache_img_name=cache_filename, masked_track_ids=masked_track_ids, disable_cache=redraw_for_masked_agents or self.disable_cache)
+             else:
+                 bbox_im = Image.open(cache_file)
+                 bbox_im = self.transform(bbox_im)
+         else:
+             labels = None
+             bbox_im = None
+
+         ret_dict = {"image": image,
+                     "image_path": image_file,
+                     "labels": labels,
+                     "frame_index": frame_index,
+                     "bbox_image": bbox_im}
+
+         return ret_dict
+
+     def _draw_bbox(self, frame_labels, cache_img_name=None, masked_track_ids=None, disable_cache=False):
+         canvas = torch.zeros((3, self.orig_height, self.orig_width))
+         bbox_im = plot_2d_bbox(canvas, frame_labels, show_track_color=True, masked_track_ids=masked_track_ids)
+         transform = transforms.Compose([transforms.ToPILImage()])
+         bbox_pil = transform(bbox_im)
+
+         if cache_img_name is not None and not disable_cache:
+             if not os.path.exists(self.bbox_image_dir):
+                 os.makedirs(self.bbox_image_dir, exist_ok=True)
+             image_path = os.path.join(self.bbox_image_dir, f"{cache_img_name}.jpg")
+             bbox_pil.save(image_path)
+             print("Cached bbox file:", image_path)
+
+         bbox_im = self.transform(bbox_pil)
+         return bbox_im
+
+     def _get_masked_track_ids(self, frames_indices):
+         masked_track_ids = []
+         if self.bbox_masking_prob > 0:
+             # Find all the track IDs in the clip, randomly select some to mask and exclude from the bbox rendering
+             all_track_ids = set()
+             for frame_idx in frames_indices:
+                 frame_labels = self.frame_labels[frame_idx]  # self._parse_label(self.image_files[frame])
+                 for label in frame_labels:
+                     track_id = label['track_id']
+                     if track_id not in all_track_ids and random.random() <= self.bbox_masking_prob:
+                         # Mask out this agent
+                         masked_track_ids.append(track_id)
+                     all_track_ids.add(label['track_id'])
+
+         return masked_track_ids
+
+     def get_frame_file_by_index(self, index, timestep=0):
+         frames = self.clip_list[index]
+         if timestep is None:
+             ret = []
+             for frame in frames:
+                 ret.append(self.image_files[frame])
+             return ret
+         return self.image_files[frames[timestep]]
+
+     def get_bbox_image_file_by_index(self, index=None, image_file=None):
+         if image_file is None:
+             image_file = self.get_frame_file_by_index(index)
+
+         clip_name = image_file.split("/")[-2]
+         return image_file.replace(self.image_dir, self.bbox_image_dir).replace('/' + clip_name + '/', '/').replace(".jpg", "_bboxes.jpg")
src/datasets/bbox_utils.py ADDED
@@ -0,0 +1,68 @@
+ import cv2
+ import numpy as np
+ from collections import defaultdict
+
+ class CVCOLORS:
+     RED = (0,0,255)
+     GREEN = (0,255,0)
+     BLUE = (255,0,0)
+     PURPLE = (247,44,200)
+     ORANGE = (44,162,247)
+     MINT = (239,255,66)
+     YELLOW = (2,255,250)
+     BROWN = (42,42,165)
+     LIME = (51,255,153)
+     GRAY = (128, 128, 128)
+     LIGHTPINK = (222,209,255)
+     LIGHTGREEN = (204,255,204)
+     LIGHTBLUE = (255,235,207)
+     LIGHTPURPLE = (255,153,204)
+     LIGHTRED = (204,204,255)
+     WHITE = (255,255,255)
+     BLACK = (0,0,0)
+
+     TRACKID_LOOKUP = defaultdict(lambda: (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255)))
+     TYPE_LOOKUP = [BROWN, BLUE, PURPLE, RED, ORANGE, YELLOW, LIGHTPINK, LIGHTPURPLE, GRAY, LIGHTRED, GREEN]
+     REVERT_CHANNEL_F = lambda x: (x[2], x[1], x[0])
+
+
+ # TODO: This could be moved to base dataset class (?)
+ def plot_2d_bbox(img, labels, show_track_color=False, channel_first=True, rgb2bgr=False, box_color=None, masked_track_ids=None, crash_border=False):
+
+     if channel_first:
+         img = img.permute((1, 2, 0)).detach().cpu().numpy().copy()*255
+     else:
+         img = img.detach().cpu().numpy().copy()*255
+
+     if rgb2bgr:
+         img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
+
+     masked_track_ids = masked_track_ids or []
+
+     for i, label_info in enumerate(labels):
+         track_id = label_info['track_id']
+         if track_id in masked_track_ids:
+             continue
+
+         box_2d = label_info['bbox']
+
+         if not show_track_color:
+             type_color_i = np.array(CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TYPE_LOOKUP[label_info['class_id']])) / 255 if box_color is None else box_color
+             track_color_i = CVCOLORS.REVERT_CHANNEL_F((1, 1, 1))
+
+             cv2.rectangle(img, (int(box_2d[0]), int(box_2d[1])), (int(box_2d[2]), int(box_2d[3])), type_color_i, cv2.FILLED)
+             cv2.rectangle(img, (int(box_2d[0]), int(box_2d[1])), (int(box_2d[2]), int(box_2d[3])), track_color_i, 2)
+         else:
+             type_color_i = np.array(CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TYPE_LOOKUP[label_info['class_id']])) / 255 if box_color is None else box_color
+             track_color_i = CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TRACKID_LOOKUP[label_info['track_id']])
+
+             dim = min(box_2d[2] - box_2d[0], box_2d[3] - box_2d[1])
+             b_thick = min(max(dim * 0.1, 2), 8)
+             cv2.rectangle(img, (int(box_2d[0]), int(box_2d[1])), (int(box_2d[2]), int(box_2d[3])), type_color_i, cv2.FILLED)
+             cv2.rectangle(img, (int(box_2d[0] + b_thick), int(box_2d[1] + b_thick)), (int(box_2d[2] - b_thick), int(box_2d[3] - b_thick)), track_color_i, cv2.FILLED)
+
+     if crash_border:
+         thickness = 20
+         cv2.rectangle(img, (0, 0), (img.shape[1], img.shape[0]), color=(0, 1, 0), thickness=thickness, lineType=cv2.LINE_8)
+
+     return img
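A minimal sketch of rendering a bbox conditioning frame with the helper above (the label values are invented for illustration; class_id 3 corresponds to 'car' in the class maps used by the datasets in this commit):

    import torch
    from src.datasets.bbox_utils import plot_2d_bbox

    canvas = torch.zeros((3, 320, 512))  # blank CHW canvas, as BaseDataset._draw_bbox does
    labels = [{'track_id': 1, 'class_id': 3, 'bbox': [100, 120, 220, 200]}]  # hypothetical single car box
    bbox_image = plot_2d_bbox(canvas, labels, show_track_color=True)  # returns an H x W x 3 numpy array with the filled box drawn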
src/datasets/bdd100k_dataset.py ADDED
@@ -0,0 +1,185 @@
+ from .base_dataset import BaseDataset
+
+ from PIL import Image
+ import os
+ import json
+
+
+ class BDD100KDataset(BaseDataset):
+     CLASS_NAME_TO_ID = {
+         'pedestrian': 1,
+         'rider': 2,
+         'car': 3,
+         'truck': 4,
+         'bus': 5,
+         'train': 6,
+         'motorcycle': 7,
+         'bicycle': 8,
+         'traffic light': 9,
+         'traffic sign': 10,
+     }
+
+     TO_COCO_LABELS = {
+         1: 0,
+         2: 0,
+         3: 2,
+         4: 7,
+         5: 5,
+         6: 6,
+     }
+
+     TO_IMAGE_DIR = 'images/track'
+     TO_BBOX_DIR = 'bboxes/track'
+     TO_LABEL_DIR = 'labels'
+     TO_BBOX_LABELS = 'labels/box_track_20'
+     TO_SEG_LABELS = 'labels/seg_track_20/colormaps'
+     TO_POSE_LABELS = 'labels/pose_21'
+
+     def __init__(self,
+                  root='./datasets',
+                  train=True,
+                  clip_length=25,
+                  # orig_height=720, orig_width=1280,  # TODO: Define this (and use it)
+                  resize_height=320, resize_width=512,
+                  non_overlapping_clips=False,
+                  bbox_masking_prob=0.0,
+                  sample_clip_from_end=True,
+                  ego_only=False,
+                  ignore_labels=False,
+                  use_preplotted_bbox=True,
+                  specific_samples=None,
+                  specific_categories=None,
+                  force_clip_type=None):
+
+         super(BDD100KDataset, self).__init__(root=root,
+                                              train=train,
+                                              clip_length=clip_length,
+                                              resize_height=resize_height,
+                                              resize_width=resize_width,
+                                              non_overlapping_clips=non_overlapping_clips,
+                                              bbox_masking_prob=bbox_masking_prob,
+                                              sample_clip_from_end=sample_clip_from_end,
+                                              ego_only=ego_only,
+                                              ignore_labels=ignore_labels)
+
+         self.MAX_BOXES_PER_DATA = 30
+         self._location = 'train' if self.train else 'val'
+         self.version = 'bdd100k'
+         self.use_preplotted_bbox = use_preplotted_bbox
+
+         self.image_dir = os.path.join(self.root, self.version, BDD100KDataset.TO_IMAGE_DIR, self._location)
+         self.bbox_label_dir = os.path.join(self.root, self.version, BDD100KDataset.TO_BBOX_LABELS, self._location)
+         self.bbox_image_dir = os.path.join(self.root, self.version, BDD100KDataset.TO_BBOX_DIR, self._location)
+
+         if specific_categories is not None:
+             print("BDD100k does not support `specific_categories`")
+         if force_clip_type is not None:
+             print("BDD100k does not support `force_clip_type`")
+         self.specific_samples = specific_samples
+         if self.specific_samples is not None:
+             print("Only loading specific samples:", self.specific_samples)
+
+         listed_image_dir = os.listdir(self.image_dir)
+         try:
+             listed_image_dir.remove('pred')
+         except ValueError:
+             pass
+         self.clip_folders = sorted(listed_image_dir)
+         self.clip_folder_lengths = {k: len(os.listdir(os.path.join(self.image_dir, k))) for k in self.clip_folders}
+
+         for l in self.clip_folder_lengths.values():
+             assert l >= self.clip_length, f'clip length {self.clip_length} is too long for clip folder length {l}'
+
+         self._collect_clips()
+
+     def _collect_clips(self):
+         print("Collecting dataset clips...")
+
+         for clip_folder in self.clip_folders:
+             clip_path = os.path.join(self.image_dir, clip_folder)
+             clip_frames = sorted(os.listdir(clip_path))
+
+             if self.specific_samples is not None and clip_folder not in self.specific_samples:
+                 continue
+
+             # Add all images to image_files
+             image_indices = []
+             for frame in clip_frames:
+                 self.image_files.append(os.path.join(clip_path, frame))
+                 image_indices.append(len(self.image_files)-1)
+
+             # Create clips of length clip_length
+             if self.clip_length is not None:
+                 # Collect clips as overlapping clips (i.e. a video with 30 frames will yield 5 25-frame clips)
+                 for start_image_idx in range(0, len(clip_frames) - self.clip_length + 1):
+                     end_image_idx = start_image_idx + self.clip_length
+                     clip_indices = image_indices[start_image_idx:end_image_idx]
+                     self.clip_list.append(clip_indices)
+
+     def _parse_label(self, label_file, frame_id):
+         target = []
+         with open(label_file, 'r') as f:
+             label = json.load(f)
+             frame_i = int(frame_id[-11:-4])-1
+             assert frame_id == label[frame_i]['name']
+             for obj in label[frame_i-1]['labels']:
+                 if obj['category'] not in BDD100KDataset.CLASS_NAME_TO_ID:
+                     continue
+                 target.append({
+                     'frame_name': frame_id,
+                     'track_id': int(obj['id']),
+                     'bbox': [obj['box2d']['x1'], obj['box2d']['y1'], obj['box2d']['x2'], obj['box2d']['y2']],
+                     'class_id': BDD100KDataset.CLASS_NAME_TO_ID[obj['category']],
+                     'class_name': obj['category'],
+                 })
+                 if len(target) >= self.MAX_BOXES_PER_DATA:
+                     break
+         return target
+
+     def _getimageitem(self, frame_index, masked_track_ids=None):
+         # Get the image
+         image_file = self.image_files[frame_index]
+         image = Image.open(image_file)
+         image = self.transform(image)
+
+         if not self.ignore_labels:
+             # Get the labels
+             clip_id = image_file[:image_file.rfind('/')]
+             clip_id = clip_id[clip_id.rfind('/')+1:]
+             label_file = os.path.join(self.bbox_label_dir, f'{clip_id}.json')
+             frame_id = image_file[image_file.rfind('/')+1:]
+             labels = self._parse_label(label_file, frame_id)
+
+             # Get the bbox image
+             if self.use_preplotted_bbox:
+                 bbox_file = self.get_bbox_image_file_by_index(image_file=image_file)
+                 bbox_im = Image.open(bbox_file)
+                 bbox_im = self.transform(bbox_im)
+             else:
+                 bbox_im = self._draw_bbox(labels, masked_track_ids=masked_track_ids)
+         else:
+             labels = None
+             bbox_im = None
+
+         ret_dict = {"image": image,
+                     "image_path": image_file,
+                     "labels": labels,
+                     "frame_index": frame_index,
+                     "bbox_image": bbox_im}
+
+         return ret_dict
+
+     def get_bbox_image_file_by_index(self, index=None, image_file=None):
+         if image_file is None:
+             image_file = self.get_image_file_by_index(index)
+
+         return image_file.replace(BDD100KDataset.TO_IMAGE_DIR, BDD100KDataset.TO_BBOX_DIR)
+
+     def get_image_file_by_index(self, index):
+         return self.image_files[index]
+
+     def __len__(self):
+         return len(self.clip_list) if self.clip_length is not None else len(self.image_files)
+
+
+ if __name__ == "__main__":
+     dataset = BDD100KDataset()
src/datasets/dada2000_dataset.py ADDED
@@ -0,0 +1,339 @@
+ import os
+ import json
+ from tqdm import tqdm
+ import csv
+
+ from src.datasets.base_dataset import BaseDataset
+
+
+ class DADA2000Dataset(BaseDataset):
+
+     CLASS_NAME_TO_ID = {
+         'person': 1,
+         'car': 3,
+         'truck': 4,
+         'bus': 5,
+         'train': 6,
+         'motorcycle': 7,
+         'bicycle': 8,
+     }
+
+     def __init__(self,
+                  root='./datasets',
+                  train=True,
+                  clip_length=25,
+                  orig_height=660, orig_width=1056,
+                  resize_height=320, resize_width=512,
+                  non_overlapping_clips=False,
+                  bbox_masking_prob=0.0,
+                  sample_clip_from_end=True,
+                  ego_only=False,
+                  specific_samples=None):
+
+         super(DADA2000Dataset, self).__init__(root=root,
+                                               train=train,
+                                               clip_length=clip_length,
+                                               resize_height=resize_height,
+                                               resize_width=resize_width,
+                                               non_overlapping_clips=non_overlapping_clips,
+                                               bbox_masking_prob=bbox_masking_prob,
+                                               sample_clip_from_end=sample_clip_from_end,
+                                               ego_only=ego_only)
+
+         self.dataset_name = "preprocess_dada2000"
+
+         self.orig_width = orig_width
+         self.orig_height = orig_height
+         self.image_dir = os.path.join(self.root, self.dataset_name, "images", self.data_split)
+         self.label_dir = os.path.join(self.root, self.dataset_name, "labels", self.data_split)
+         self.bbox_image_dir = os.path.join(self.root, self.dataset_name, "bbox_images", self.data_split)
+         self.metadata_csv_path = os.path.join(self.root, self.dataset_name, "metadata.csv")  # TODO: This information could be transferred into each individual label file
+
+         self.strict_collision_filter = True
+         if self.strict_collision_filter:
+             print("Strict collision filter set for DADA2000")
+
+         self.specific_samples = specific_samples
+         if self.specific_samples is not None:
+             print("Only loading specific samples:", self.specific_samples)
+
+         self._collect_clips()
+
+     def _collect_clips(self):
+
+         accident_frame_metadata = {}
+         with open(self.metadata_csv_path) as csv_file:
+             csv_reader = csv.reader(csv_file)
+             for i, row in enumerate(csv_reader):
+                 if i == 0:
+                     continue
+
+                 video_num = row[0]
+                 video_type = row[5]
+                 abnormal_start_frame_idx = int(row[7])
+                 accident_frame_idx = int(row[8])
+                 abnormal_end_frame_idx = int(row[9])
+
+                 video_name = f"{video_type}_{video_num.rjust(3, '0')}"
+
+                 if accident_frame_idx == -1:
+                     # print("Skipping video:", video_name)
+                     continue
+
+                 # Need to convert the original frame idx to the closest downsampled frame index
+                 downsample_factor = 30/7  # Because we downsampled from 30fps to 7fps
+                 accident_frame_metadata[video_name] = (int(abnormal_start_frame_idx / downsample_factor),
+                                                        int(accident_frame_idx / downsample_factor + 0.5),
+                                                        int(abnormal_end_frame_idx / downsample_factor + 0.5))
+
+         self.clip_type_list = []  # crash, normal or abnormal (abnormal is a scene that has abnormal driving but doesn't contain the actual crash moment)
+         image_indices_by_clip = {}
+         for label_file in sorted(os.listdir(self.label_dir)):
+             if not label_file.endswith('.json'):
+                 continue
+
+             full_filename = os.path.join(self.label_dir, label_file)
+             with open(full_filename) as json_file:
+                 all_data = json.load(json_file)
+                 metadata = all_data['metadata']
+
+             if self.ego_only:
+                 print("Ego collisions only activated!")
+                 if metadata['ego_involved'] == False:
+                     continue
+
+             if self.strict_collision_filter and metadata["accident_type"] in [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 28, 29, 31, 32, 34, 35, 36]:
+                 # Reject videos where the collision is with "static" agents
+                 continue
+
+             # Some rejected clips
+             if all_data["video_source"] in ["10_001.mp4"]:
+                 continue
+
+             if self.specific_samples is not None and all_data["video_source"].split(".")[0] not in self.specific_samples:
+                 continue
+
+             clip_filename = label_file.split('.')[0]
+             clip_file = os.path.join(self.image_dir, clip_filename)
+             clip_data = all_data["data"]
+             clip_frames = sorted(os.listdir(clip_file))
+
+             num_frames = len(clip_frames)
+             if num_frames < self.clip_length:
+                 # print(f"{clip_filename} does not have enough frames: has {num_frames}, expected at least {self.clip_length}")
+                 continue
+
+             accident_metadata = accident_frame_metadata.get(clip_filename)
+             if accident_metadata is None:
+                 print(clip_filename, "no accident metadata found")
+                 continue
+
+             # Frames within the abnormal range are considered accidents and frames outside it are considered normal driving
+             clip_label_data = self._parse_clip_labels(clip_data)
+             self.frame_labels.extend(clip_label_data)  # Labels are already sorted, so they match up with the image indices
+
+             image_indices_by_clip[clip_filename] = []
+             for image_file in clip_frames:
+                 self.image_files.append(os.path.join(clip_file, image_file))
+                 image_indices_by_clip[clip_filename].append(len(self.image_files)-1)
+
+             assert len(self.frame_labels) == len(self.image_files)  # We assume a one-to-one association between images and labels
+
+             ab_start_idx, acc_idx, ab_end_idx = accident_metadata
+             def get_clip_type(image_idx, end_image_idx):
+                 clip_type = "normal"
+                 if image_idx <= acc_idx and end_image_idx > acc_idx:
+                     # Contains the accident frame
+                     clip_type = "crash"
+                 elif (image_idx >= ab_start_idx and image_idx <= ab_end_idx) or (end_image_idx > ab_start_idx and end_image_idx < ab_end_idx):
+                     # Does not contain the accident frame, but contains abnormal driving (moments before and after the accident)
+                     clip_type = "abnormal"
+
+                 return clip_type
+
+             # Cut the videos into clips of the correct length according to the chosen strategy
+             if not self.non_overlapping_clips:
+                 for image_idx in range(len(image_indices_by_clip[clip_filename]) - self.clip_length + 1):
+                     end_image_idx = image_idx + self.clip_length
+
+                     clip_type = get_clip_type(image_idx, end_image_idx)
+                     if clip_type == "abnormal":
+                         # Reject the abnormal clips
+                         continue
+
+                     self.clip_list.append(image_indices_by_clip[clip_filename][image_idx:end_image_idx])
+                     self.clip_type_list.append(clip_type)
+
+             else:
+                 if self.sample_clip_from_end:
+                     # In case self.clip_length << actual video sample length, we can create multiple non-overlapping clips for each sample.
+                     # Prioritize selecting clips from the end, to make sure the accident is included (it tends to be at the end of the videos)
+                     total_frames = len(image_indices_by_clip[clip_filename])
+                     for clip_i in range(total_frames // self.clip_length):
+                         start_image_idx = total_frames - (self.clip_length * (clip_i + 1))
+                         end_image_idx = total_frames - (self.clip_length * clip_i)
+
+                         clip_type = get_clip_type(start_image_idx, end_image_idx)
+                         if clip_type == "abnormal":
+                             # Reject the abnormal clips
+                             continue
+
+                         self.clip_list.append(image_indices_by_clip[clip_filename][start_image_idx:end_image_idx])
+                         self.clip_type_list.append(clip_type)
+                 else:
+                     total_frames = len(image_indices_by_clip[clip_filename])
+                     for clip_i in range(total_frames // self.clip_length):
+                         start_image_idx = clip_i * self.clip_length
+                         end_image_idx = start_image_idx + self.clip_length
+
+                         clip_type = get_clip_type(start_image_idx, end_image_idx)
+                         if clip_type == "abnormal":
+                             # Reject the abnormal clips
+                             continue
+
+                         self.clip_list.append(image_indices_by_clip[clip_filename][start_image_idx:end_image_idx])
+                         self.clip_type_list.append(clip_type)
+
+         print("Number of clips DADA2000:", len(self.clip_list), f"({self.data_split})")
+         crash_clip_count = 0
+         normal_clip_count = 0
+         for clip_type in self.clip_type_list:
+             if clip_type == "crash":
+                 crash_clip_count += 1
+             elif clip_type == "normal":
+                 normal_clip_count += 1
+         print(crash_clip_count, "crash clips", normal_clip_count, "normal clips")
+
+     def _parse_clip_labels(self, clip_data):
+         frame_labels = []
+         for frame_data in clip_data:
+             obj_data = frame_data['labels']
+
+             object_labels = []
+             for label in obj_data:
+                 # Only keep the classes of interest
+                 class_id = DADA2000Dataset.CLASS_NAME_TO_ID.get(label['name'])
+                 if class_id is None:
+                     continue
+
+                 # Convert bbox coordinates to pixel space w.r.t. the image size
+                 bbox = label['box']
+                 bbox_coords_pixel = [int(bbox[0] * self.orig_width),   # x1
+                                      int(bbox[1] * self.orig_height),  # y1
+                                      int(bbox[2] * self.orig_width),   # x2
+                                      int(bbox[3] * self.orig_height)]  # y2
+
+                 object_labels.append({
+                     'frame_name': frame_data["image_source"],
+                     'track_id': int(label['track_id']),
+                     'bbox': bbox_coords_pixel,
+                     'class_id': class_id,
+                     'class_name': label['name'],  # Class name of the object
+                 })
+
+             frame_labels.append(object_labels)
+
+         return frame_labels
+
+
+ def pre_cache_dataset(dataset_root):
+     # Trigger label and bbox image cache generation
+     from time import time
+     dataset_train = DADA2000Dataset(root=dataset_root, train=True, clip_length=25, non_overlapping_clips=False)
+     t = time()
+     for i in tqdm(range(len(dataset_train))):
+         d = dataset_train[i]
+         if i >= 100:
+             print("Time:", time() - t)
+             print("break")
+
+     dataset_val = DADA2000Dataset(root=dataset_root, train=False, clip_length=25, non_overlapping_clips=True)
+     for i in tqdm(range(len(dataset_val))):
+         d = dataset_val[i]
+
+     print("Done.")
+
+
+ if __name__ == "__main__":
+     dataset_root = "/path/to/Datasets"
+     pre_cache_dataset(dataset_root)
+
+
+ """
+ ACCIDENT TYPES
+ {
+     "ego_car_involved": {
+         "self_initiated": {
+             "out_of_control": [61]
+         },
+         "dynamic_participants": {
+             "person_centric": {
+                 "pedestrian": [1, 2],
+                 "cyclist": [3, 4]
+             },
+             "vehicle_centric": {
+                 "motorbike": [5, 6],
+                 "truck": [7, 8, 9],
+                 "car": [10, 11, 12]
+             }
+         },
+         "static_participants": {
+             "road_centric": {
+                 "large_roadblocks": [13],
+                 "curb": [14],
+                 "small_roadblocks_potholes": [15]
+             },
+             "other_semantics_centric": {
+                 "trees": [16],
+                 "telegraph_poles": [17],
+                 "other_road_facilities": [18]
+             }
+         }
+     },
+     "ego_car_uninvolved": {
+         "dynamic_participants": {
+             "vehicle_centric": {
+                 "motorbike_motorbike": [37, 38],
+                 "truck_truck": [39, 40, 41],
+                 "car_car": [42, 43, 44],
+                 "motorbike_truck": [45, 46, 47],
+                 "truck_car": [48, 49],
+                 "car_motorbike": [50, 51]
+             },
+             "person_centric": [52, 53, 54, 55, 56, 57, 58, 59, 60]
+         },
+         "static_participants": [19, 20, 21, 22, 25, 26, 28, 29, 31, 32, 34, 35, 36]
+     },
+     "summary": {
+         "ego_car_involved": {
+             "person_centric": [1, 2, 3, 4],
+             "vehicle_centric": [5, 6, 7, 8, 9, 10, 11, 12],
+             "static_participants": [13, 14, 15, 16, 17, 18],
+             "out_of_control": [61]
+         },
+         "ego_car_uninvolved": {
+             "static_participants": [19, 20, 21, 22, 25, 26, 28, 29, 31, 32, 34, 35, 36],
+             "vehicle_centric": [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 50, 51],
+             "person_centric": [52, 53, 54, 55, 56, 57, 58, 59, 60]
+         }
+     }
+ }
+ """
src/datasets/dataset_factory.py ADDED
@@ -0,0 +1,36 @@
+ from src.datasets.merged_dataset import MergedDataset
+ from src.datasets.russia_crash_dataset import RussiaCrashDataset
+ from src.datasets.dada2000_dataset import DADA2000Dataset
+ from src.datasets.mmau_dataset import MMAUDataset
+ from src.datasets.bdd100k_dataset import BDD100KDataset
+ # from src.datasets.nuscenes_dataset import NuScenesDataset
+
+
+ def create_dataset(dataset_name, **kwargs):
+
+     if str.lower(dataset_name) == "russia_crash":
+         dataset = RussiaCrashDataset(**kwargs)
+     elif str.lower(dataset_name) == "nuscenes":
+         dataset = NuScenesDataset(**kwargs)
+     elif str.lower(dataset_name) == "dada2000":
+         dataset = DADA2000Dataset(**kwargs)
+     elif str.lower(dataset_name) == "mmau":
+         dataset = MMAUDataset(**kwargs)
+     elif str.lower(dataset_name) == "bdd100k":
+         dataset = BDD100KDataset(**kwargs)
+     else:
+         raise NotImplementedError(f"Dataset '{dataset_name}' not implemented")
+
+     return dataset
+
+ def dataset_factory(dataset_names, **kwargs):
+     if isinstance(dataset_names, str) or (isinstance(dataset_names, list) and len(dataset_names) == 1):
+         dataset_name = dataset_names[0] if isinstance(dataset_names, list) else dataset_names
+         # Init the single dataset
+         return create_dataset(dataset_name, **kwargs)
+     elif isinstance(dataset_names, list):
+         all_datasets = []
+         for dataset_name in dataset_names:
+             all_datasets.append(create_dataset(dataset_name, **kwargs))
+         return MergedDataset(all_datasets)
src/datasets/dataset_utils.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+ import os
+
+ from .dataset_factory import dataset_factory
+
+
+ def worker_init_fn(worker_id):
+     os.sched_setaffinity(0, range(os.cpu_count()))
+
+ def get_dataloader(dset_root,
+                    dset_names,
+                    if_train,
+                    batch_size,
+                    num_workers,
+                    clip_length=25,
+                    shuffle=True,
+                    image_height=None,
+                    image_width=None,
+                    non_overlapping_clips=False,
+                    ego_only=False,
+                    bbox_masking_prob=0.0,
+                    specific_samples=None,
+                    specific_categories=None,
+                    force_clip_type=None):
+
+     dataset = dataset_factory(dset_names,
+                               root=dset_root,
+                               train=if_train,
+                               clip_length=clip_length,
+                               resize_height=image_height,
+                               resize_width=image_width,
+                               non_overlapping_clips=non_overlapping_clips,
+                               bbox_masking_prob=bbox_masking_prob,
+                               ego_only=ego_only,
+                               specific_samples=specific_samples,
+                               specific_categories=specific_categories,
+                               force_clip_type=force_clip_type)
+
+     dataloader = torch.utils.data.DataLoader(
+         dataset,
+         batch_size=batch_size,
+         num_workers=num_workers,
+         shuffle=shuffle,
+         pin_memory=True,
+         drop_last=True,
+         worker_init_fn=worker_init_fn
+     )
+
+     return dataset, dataloader
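A minimal sketch of building a loader with the helper above (assumes the preprocessed MMAU images/labels are present under the placeholder root; the values mirror the training scripts in this commit):

    from src.datasets.dataset_utils import get_dataloader

    dataset, dataloader = get_dataloader(
        dset_root="/path/to/datasets",  # placeholder
        dset_names="mmau",
        if_train=True,
        batch_size=1,
        num_workers=0,
        clip_length=25,
        image_height=320,
        image_width=512,
    )
    batch = next(iter(dataloader))
    print(batch["clips"].shape)  # expected (1, 25, 3, 320, 512) given the transforms in base_dataset.py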
src/datasets/merged_dataset.py ADDED
@@ -0,0 +1,54 @@
+ from torchvision import transforms
+
+ class MergedDataset:
+     """
+     Dataset wrapper to access many datasets as one
+     """
+
+     def __init__(self, dataset_list):
+
+         self.dataset_list = dataset_list
+
+         # TODO: Make sure this matches all datasets
+         self.resize_width = self.dataset_list[0].resize_width
+         self.resize_height = self.dataset_list[0].resize_height
+         self.revert_transform = self.dataset_list[0].revert_transform
+
+         print("TOTAL number of clips in merged dataset:", self.__len__(), f"({self.dataset_list[0].data_split})")
+
+     def __len__(self):
+         return sum([len(dset) for dset in self.dataset_list])
+
+     def __getitem__(self, global_index):
+
+         target_dset, rel_index = self.get_dataset_by_sample_index(global_index)
+         ret_dict = target_dset.__getitem__(rel_index)
+
+         # Overwrite the returned index with the global index
+         ret_dict["indices"] = global_index
+
+         return ret_dict
+
+     def get_dataset_by_sample_index(self, index):
+         total_idx = 0
+         target_dset = None
+         for dset in self.dataset_list:
+             total_idx += len(dset)
+             if index < total_idx:
+                 target_dset = dset
+                 break
+
+         return target_dset, (index - (total_idx - len(target_dset)))
+
+     def get_frame_file_by_index(self, index, timestep=None):
+         target_dset, rel_index = self.get_dataset_by_sample_index(index)
+         return target_dset.get_frame_file_by_index(rel_index, timestep=timestep)
+
+     def get_bbox_image_file_by_index(self, index, image_file=None):
+         target_dset, rel_index = self.get_dataset_by_sample_index(index)
+         return target_dset.get_bbox_image_file_by_index(index=rel_index)
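A small worked example of the global-to-relative index mapping in get_dataset_by_sample_index above (the dataset lengths are hypothetical):

    # Suppose dataset A has 100 clips and dataset B has 40 clips.
    # Global indices 0-99 map to (A, 0-99); indices 100-139 map to (B, 0-39).
    # For index 105: total_idx reaches 140 at B, so rel_index = 105 - (140 - 40) = 5.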
src/datasets/mmau_dataset.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from tqdm import tqdm
4
+ import csv
5
+ import json
6
+ import cv2
7
+
8
+ from src.datasets.base_dataset import BaseDataset
9
+
10
+ def load_json(filename):
11
+ if os.path.exists(filename):
12
+ with open(filename, "r") as f:
13
+ return json.load(f)
14
+ print(filename, "not found")
15
+ return []
16
+
17
+ def create_video_from_images(images_list, output_video, out_fps, start_frame=None, end_frame=None):
18
+
19
+ img0_path = images_list[0]
20
+ img0 = cv2.imread(img0_path)
21
+ height, width, _ = img0.shape
22
+
23
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
24
+ out = cv2.VideoWriter(output_video, fourcc, out_fps, (width, height))
25
+
26
+ for idx, frame_name in enumerate(images_list):
27
+
28
+ if start_frame is not None and idx < start_frame:
29
+ continue
30
+ if end_frame is not None and idx >= end_frame:
31
+ continue
32
+
33
+ img = cv2.imread(frame_name)
34
+ out.write(img)
35
+
36
+ out.release()
37
+ print("Saved video:", output_video)
38
+
39
+
40
+ class MMAUDataset(BaseDataset):
41
+
42
+ CLASS_NAME_TO_ID = {
43
+ 'person': 1,
44
+ 'car': 3,
45
+ 'truck': 4,
46
+ 'bus': 5,
47
+ 'train': 6,
48
+ 'motorcycle': 7,
49
+ 'bicycle': 8,
50
+ }
51
+
52
+ def __init__(self,
53
+ root='./datasets',
54
+ train=True,
55
+ clip_length=25,
56
+ orig_height=640, orig_width=1024,
57
+ resize_height=320, resize_width=512,
58
+ non_overlapping_clips=False,
59
+ bbox_masking_prob=0.0,
60
+ sample_clip_from_end=True,
61
+ ego_only=False,
62
+ specific_samples=None,
63
+ specific_categories=None,
64
+ dada_only=False,
65
+ cleanup_dataset=False,
66
+ force_clip_type=None
67
+ ):
68
+
69
+ self.ignore_labels = False
70
+ if self.ignore_labels:
71
+ print("IGNORING LABELS in MMAU dataset")
72
+
73
+ super(MMAUDataset, self).__init__(root=root,
74
+ train=train,
75
+ clip_length=clip_length,
76
+ resize_height=resize_height,
77
+ resize_width=resize_width,
78
+ non_overlapping_clips=non_overlapping_clips,
79
+ bbox_masking_prob=bbox_masking_prob,
80
+ sample_clip_from_end=sample_clip_from_end,
81
+ ego_only=ego_only,
82
+ ignore_labels=self.ignore_labels) # NOTE: Ignoring labels currently
83
+
84
+ self.dada_only = dada_only
85
+ self.cleanup_dataset = cleanup_dataset
86
+ self.dataset_name = "mmau_images_12fps" if not dada_only else "dada2000_images_12fps"
87
+
88
+ self.orig_width = orig_width
89
+ self.orig_height = orig_height
90
+ self.split = "train" if train else "val"
91
+
92
+ self.image_dir = os.path.join(self.root, self.dataset_name, "images")
93
+ self.label_dir = os.path.join(self.root, self.dataset_name, "labels")
94
+ self.bbox_image_dir = os.path.join(self.root, self.dataset_name, "bbox_images")
95
+
96
+ self.downsample_6fps = True
97
+ if self.downsample_6fps:
98
+ print("Downsampling MMAU clips to 6 fps")
99
+
100
+ self.ego_only = ego_only
101
+ if self.ego_only:
102
+ print("Ego collisions only filter set for MMAU dataset")
103
+
104
+ self.strict_collision_filter = False
105
+ if self.strict_collision_filter:
106
+ print("Strict collision filter set for MMAU dataset")
107
+
108
+ self.specific_samples = specific_samples
109
+ if self.specific_samples is not None:
110
+ print("Only loading specific samples:", self.specific_samples)
111
+
112
+ self.specific_categories = specific_categories
113
+ if self.specific_categories is not None:
114
+ print("Only loading specific categories:", self.specific_categories)
115
+
116
+ self.force_clip_type = force_clip_type
117
+ if self.force_clip_type is not None:
118
+ print("Only loading samples with type:", force_clip_type)
119
+
120
+ self._collect_clips()
121
+
122
+ def _collect_metadata_csv(self, metadata_csv_path):
123
+ accident_frame_metadata = {}
124
+ with open(metadata_csv_path) as csv_file:
125
+ csv_reader = csv.reader(csv_file)
126
+ for i, row in enumerate(csv_reader):
127
+ if i == 0:
128
+ continue
129
+
130
+ video_num = str(int(row[0]))
131
+ video_type = str(int(row[5]))
132
+ abnormal_start_frame_idx = int(row[7])
133
+ accident_frame_idx = int(row[9])
134
+ abnormal_end_frame_idx = int(row[8])
135
+
136
+ video_name = f"{video_type}_{video_num.rjust(5, '0')}"
137
+
138
+ if accident_frame_idx == -1:  # row[9] was parsed as an int above, so compare against the integer sentinel
139
+ # print("Skipping video:", video_name)
140
+ continue
141
+
142
+ downsample_factor = 30/12 if video_num.startswith("90") else 1  # Downsample factors differ: DADA clips are 30 fps, CAP clips are 12 fps
143
+ if self.downsample_6fps:
144
+ downsample_factor *= 2
145
+ accident_frame_metadata[video_name] = (int(abnormal_start_frame_idx / downsample_factor),
146
+ int(accident_frame_idx / downsample_factor + 0.5),
147
+ int(abnormal_end_frame_idx / downsample_factor + 0.5))
148
+
149
+ return accident_frame_metadata
150
+
151
+ def _collect_clips(self):
152
+ print("Collecting dataset clips...")
153
+
154
+ mmau_dataset = os.path.join(self.root, self.dataset_name)
155
+
156
+ # Load data split
157
+ datasplit_data = load_json(os.path.join(mmau_dataset, "mmau_datasplit.json"))
158
+
159
+ # Compile reject videos
160
+ auto_filtered_vids = load_json(os.path.join(mmau_dataset, "auto_low_quality.json"))
161
+ rejected_vids = load_json(os.path.join(mmau_dataset, "rejected.json"))
162
+ all_rejected_vids = auto_filtered_vids + rejected_vids
163
+
164
+ # Collect the accident moment information
165
+ accident_frame_metadata = self._collect_metadata_csv(os.path.join(mmau_dataset, "mmau_metadata.csv"))
166
+
167
+ self.clip_type_list = [] # crash or normal or abnormal (abnormal is a scene that has abnormal driving but doesn't contain the actual crash moment)
168
+ self.action_type_list = [] # 0-4, 0 is normal, 1-4 are different types of crashes
169
+ image_indices_by_clip = {}
170
+
171
+ null_labels = []
172
+ # Iterate datasplit file
173
+ count_vid = 0
174
+ for category, split in datasplit_data.items():
175
+ for split_name, vid_names in split.items():
176
+ if split_name != self.split:
177
+ continue
178
+
179
+ for vid_name in vid_names:
180
+ if vid_name in all_rejected_vids:
181
+ continue
182
+
183
+ if self.dada_only and not vid_name.split("_")[-1].startswith("90"): # NOTE: REMOVE THIS
184
+ continue
185
+
186
+ # Read image files
187
+ image_dir = os.path.join(mmau_dataset, "images")
188
+ clip_file = os.path.join(image_dir, category, vid_name)
189
+ clip_frames = sorted(os.listdir(clip_file))
190
+
191
+ if self.cleanup_dataset:
192
+ # NOTE: For renaming frames (can remove this later)
193
+ fix_label = False
194
+ for frame_name in clip_frames:
195
+ if vid_name not in frame_name:
196
+ fix_label = True
197
+ new_frame_name = f"{vid_name}_{frame_name}"
198
+ root_path = os.path.join(mmau_dataset, "images", category, vid_name)
199
+ os.rename(os.path.join(root_path, frame_name), os.path.join(root_path, new_frame_name))
200
+
201
+ image_dir = os.path.join(mmau_dataset, "images")
202
+ clip_file = os.path.join(image_dir, category, vid_name)
203
+ clip_frames = sorted(os.listdir(clip_file))
204
+
205
+ # Also rename in label file
206
+ label_file_path = os.path.join(self.label_dir, f"{vid_name}.json")
207
+ if os.path.exists(label_file_path) and fix_label:
208
+ with open(label_file_path, "r") as f:
209
+ data = json.load(f)
210
+ data_field = data["data"]
211
+ if data_field is None:
212
+ print(f"{vid_name}.json CLIP DATA IS NULL 2")
213
+ null_labels.append(vid_name)
214
+ else:
215
+ for i, frame_data in enumerate(data_field):
216
+ current_frame_name = frame_data["image_source"]
217
+ if vid_name not in current_frame_name:
218
+ new_frame_name = f"{vid_name}_{current_frame_name}"
219
+ data["data"][i]["image_source"] = new_frame_name
220
+
221
+ with open(label_file_path, "w") as f:
222
+ json.dump(data, f, indent=1)
223
+
224
+ num_frames = len(clip_frames) if not self.downsample_6fps else len(clip_frames) // 2
225
+ if num_frames < self.clip_length:
226
+ print(f"{vid_name} does not have enough frames: has {num_frames}, expected at least {self.clip_length}")
227
+ continue
228
+
229
+ accident_metadata = accident_frame_metadata.get(vid_name)
230
+ if accident_metadata is None:
231
+ print(vid_name, "no accident metadata found")
232
+ continue
233
+
234
+ step = 2 if self.downsample_6fps else 1
235
+ clip_frame_names = []
236
+ for image_idx in range(0, len(clip_frames), step):
237
+ image_file = clip_frames[image_idx]
238
+ clip_frame_names.append(image_file)
239
+
240
+ count_vid += 1
241
+ # Read label file
242
+ if not self.ignore_labels:
243
+ label_file_path = os.path.join(self.label_dir, f"{vid_name}.json")
244
+ if not os.path.exists(label_file_path):
245
+ if num_frames <= 300:
246
+ # Because a lot of the long videos were rejected because they were too long to process
247
+ # print(f"{label_file_path} does not exist")
248
+ pass
249
+ continue
250
+
251
+ with open(label_file_path) as json_file:
252
+ all_data = json.load(json_file)
253
+ metadata = all_data['metadata']
254
+
255
+ if self.ego_only:
256
+ if metadata['ego_involved'] == False:
257
+ continue
258
+
259
+ if self.strict_collision_filter and metadata["accident_type"] in [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 28, 29, 31, 32, 34, 35, 36]:
260
+ # Reject video where the collision is with "static" agents
261
+ continue
262
+
263
+ # Some post-hoc rejected clips
264
+ if all_data["video_source"] in ["10_90001.mp4"]:
265
+ continue
266
+
267
+ if self.specific_samples is not None and all_data["video_source"].split(".")[0] not in self.specific_samples:
268
+ continue
269
+
270
+ if self.specific_categories is not None and metadata["accident_type"] not in self.specific_categories:
271
+ continue
272
+
273
+ clip_data = all_data["data"]
274
+ if clip_data is None:
275
+ print(f"{vid_name}.json CLIP DATA IS NULL")
276
+ null_labels.append(vid_name)
277
+ continue
278
+
279
+ clip_label_data = self._parse_clip_labels(clip_data, clip_frame_names)
280
+ self.frame_labels.extend(clip_label_data) # In this case, labels are already sorted so they will match up to the image indices
281
+
282
+ image_indices_by_clip[vid_name] = []
283
+ for image_file in clip_frame_names:
284
+ self.image_files.append(os.path.join(clip_file, image_file))
285
+ image_indices_by_clip[vid_name].append(len(self.image_files)-1)
286
+
287
+ if not self.ignore_labels:
288
+ assert len(self.frame_labels) == len(self.image_files), f"{len(self.frame_labels)} frame labels != {len(self.image_files)} image files" # We assume a one-to-one association between images and labels
289
+
290
+ ab_start_idx, acc_idx, ab_end_idx = accident_metadata
291
+ def get_clip_type(image_idx, end_image_idx):
292
+ clip_type = "normal"
293
+ if image_idx <= acc_idx and end_image_idx >= acc_idx:
294
+ # Contains accident frame
295
+ clip_type = "crash"
296
+ elif (image_idx >= ab_start_idx and image_idx <= ab_end_idx) \
297
+ or (end_image_idx >= ab_start_idx and end_image_idx <= ab_end_idx) \
298
+ or image_idx > acc_idx: # Let's also consider "normal" driving clip that happen after the accident to be "abnormal" as they might show the aftermath (e.g. car damage)
299
+ # Does not contain accident frame, but contains abnormal driving (moment before and after accident)
300
+ clip_type = "abnormal"
301
+
302
+ return clip_type
303
+
304
+ # Cut the videos in clips of the correct length
305
+ # NOTE: Only implementing strategy of selecting two clips per video: 1 with normal driving and 1 with crash
306
+ # Select normal driving from beginning preferably and crash clip try to center it on the accident instant
307
+
308
+ # Find crash clip
309
+ crash_found = False
310
+ if self.force_clip_type is None or self.force_clip_type == "crash":
311
+ start_image_idx, end_image_idx = None, None
312
+ total_frames = len(image_indices_by_clip[vid_name])
313
+ if acc_idx is not None and self.clip_length is not None:
314
+ # Keep frame_count frames around accident frame
315
+ start_image_idx = acc_idx - int(self.clip_length/2 + 0.5)
316
+ end_image_idx = acc_idx + int(self.clip_length/2)
317
+
318
+ if total_frames < self.clip_length:
319
+ print(f"Not enough frames in '{vid_name}': {total_frames}, skipping")
320
+ else:
321
+ if start_image_idx < 0:
322
+ end_image_idx += -(start_image_idx)
323
+ start_image_idx = 0
324
+
325
+ if end_image_idx > total_frames:
326
+ start_image_idx -= (end_image_idx - total_frames)
327
+ end_image_idx = total_frames
328
+
329
+ self.clip_list.append(image_indices_by_clip[vid_name][start_image_idx:end_image_idx])
330
+ self.clip_type_list.append("crash")
331
+ action_type = self._get_action_type(metadata["accident_type"])
332
+ self.action_type_list.append(action_type)
333
+ crash_found = True
334
+
335
+ # Debug: #############
336
+ # frame_path_list = [self.image_files[i] for i in image_indices_by_clip[vid_name][start_image_idx:end_image_idx]]
337
+ # create_video_from_images(frame_path_list, f"outputs/sample_clip_{vid_name}_crash.mp4", out_fps=6 if self.downsample_6fps else 12)
338
+
339
+ # # Debug plot bboxes:
340
+ # out_bbox_path = os.path.join("outputs", f"{vid_name}_bboxes_crash")
341
+ # os.makedirs(out_bbox_path, exist_ok=True)
342
+ # for frame_path, label_data in zip(frame_path_list, clip_label_data[start_image_idx:end_image_idx]):
343
+ # plt.figure(figsize=(9, 6))
344
+ # plt.axis("off")
345
+ # img = Image.open(frame_path)
346
+ # plt.imshow(img)
347
+ # for obj in label_data:
348
+ # color = np.array(CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TYPE_LOOKUP[obj["class_id"]])) / 255.0
349
+ # show_box(obj["bbox"], plt.gca(), label=str(obj["track_id"]), color=color)
350
+
351
+ # frame_id_name = frame_path.split("_")[-1].split(".")[0]
352
+ # plt.savefig(os.path.join(out_bbox_path, f"bboxes_frame_{frame_id_name}.jpg"))
353
+ #######################3
354
+
355
+ if not crash_found:
356
+ print("Crash not found for", vid_name)
357
+
358
+ if crash_found: assert end_image_idx > start_image_idx  # indices are only defined when a crash clip was extracted
359
+
360
+ if self.force_clip_type is None or self.force_clip_type == "normal":
361
+ normal_found = False
362
+ for start_image_idx in range(len(image_indices_by_clip[vid_name]) - self.clip_length + 1):
363
+ end_image_idx = start_image_idx+self.clip_length
364
+
365
+ clip_type = get_clip_type(start_image_idx, end_image_idx)
366
+ if clip_type == "abnormal" or clip_type == "crash":
367
+ # Let's just reject the abnormal clips
368
+ continue
369
+
370
+ self.clip_list.append(image_indices_by_clip[vid_name][start_image_idx:end_image_idx])
371
+ self.clip_type_list.append(clip_type)
372
+ self.action_type_list.append(0)
373
+ normal_found = True
374
+
375
+ # Debug: ########
376
+ # frame_path_list = [self.image_files[i] for i in image_indices_by_clip[vid_name][start_image_idx:end_image_idx]]
377
+ # create_video_from_images(frame_path_list, f"outputs/sample_clip_{vid_name}_normal.mp4", out_fps=6 if self.downsample_6fps else 12)
378
+
379
+ # out_bbox_path = os.path.join("outputs", f"{vid_name}_bboxes_normal")
380
+ # os.makedirs(out_bbox_path, exist_ok=True)
381
+ # for frame_path, label_data in zip(frame_path_list, clip_label_data[start_image_idx:end_image_idx]):
382
+ # plt.figure(figsize=(9, 6))
383
+ # plt.axis("off")
384
+ # img = Image.open(frame_path)
385
+ # plt.imshow(img)
386
+ # for obj in label_data:
387
+ # color = np.array(CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TYPE_LOOKUP[obj["class_id"]])) / 255.0
388
+ # show_box(obj["bbox"], plt.gca(), label=str(obj["track_id"]), color=color)
389
+
390
+ # frame_id_name = frame_path.split("_")[-1].split(".")[0]
391
+ # plt.savefig(os.path.join(out_bbox_path, f"bboxes_frame_{frame_id_name}.jpg"))
392
+ #################
393
+
394
+ break
395
+
396
+ # if not normal_found:
397
+ # print("Normal not found for", vid_name)
398
+
399
+ assert len(self.clip_list) == len(self.clip_type_list) == len(self.action_type_list)
400
+
401
+ print("Number of clips MMAU:", len(self.clip_list), f"({self.data_split})", f"(from {count_vid} original videos)")
402
+ crash_clip_count = 0
403
+ normal_clip_count = 0
404
+ for clip_type in self.clip_type_list:
405
+ if clip_type == "crash":
406
+ crash_clip_count += 1
407
+ elif clip_type == "normal":
408
+ normal_clip_count += 1
409
+ print(crash_clip_count, "crash clips", normal_clip_count, "normal clips")
410
+
411
+ if self.cleanup_dataset and len(null_labels) > 0:
412
+ print("Null labels:", null_labels)
413
+ for label_name in null_labels:
414
+ label_file_path = os.path.join(self.label_dir, f"{label_name}.json")
415
+ if os.path.exists(label_file_path):
416
+ os.remove(label_file_path)
417
+ print("Removed label file:", label_file_path)
418
+
419
+ def _parse_clip_labels(self, clip_data, clip_frame_names):
420
+ frame_labels = []
421
+ for frame_data in clip_data:
422
+ obj_data = frame_data['labels']
423
+ image_source = frame_data["image_source"]
424
+
425
+ if self.downsample_6fps and image_source not in clip_frame_names:
426
+ # Only preserve even numbered frames
427
+ continue
428
+
429
+ object_labels = []
430
+ for label in obj_data:
431
+ # Only keep the classes of interest
432
+ class_id = MMAUDataset.CLASS_NAME_TO_ID.get(label['name'])
433
+ if class_id is None:
434
+ continue
435
+
436
+ # Convert bbox coordinates to pixel space wrt to image size
437
+ bbox = label['box']
438
+ bbox_coords_pixel = [int(bbox[0] * self.orig_width), # x1
439
+ int(bbox[1] * self.orig_height), # y1
440
+ int(bbox[2] * self.orig_width), # x2
441
+ int(bbox[3] * self.orig_height)] # y2
442
+
443
+ object_labels.append({
444
+ 'frame_name': image_source,
445
+ 'track_id': int(label['track_id']),
446
+ 'bbox': bbox_coords_pixel,
447
+ 'class_id': class_id,
448
+ 'class_name': label['name'], # Class name of the object
449
+ })
450
+
451
+ frame_labels.append(object_labels)
452
+
453
+ return frame_labels
454
+
455
+ def _get_action_type(self, accident_type):
456
+ # [0: normal, 1: ego, 2: ego/veh, 3: veh, 4: veh/veh]
457
+ accident_type = int(accident_type)
458
+ if accident_type in [61, 62, 13, 14, 15, 16, 17, 18]:
459
+ return 1
460
+ elif accident_type in range(1, 12 + 1):
461
+ return 2
462
+ elif accident_type in [37, 39, 41, 42, 44] + list(range(19, 36 + 1)) + list(range(52, 60 + 1)):
463
+ return 3
464
+ elif accident_type in [38, 40, 43, 45, 46, 47, 48, 49, 50, 51]:
465
+ return 4
466
+ else:
467
+ raise ValueError(f"Unknown accident type: {accident_type}")
468
+
469
+ def pre_cache_dataset(dataset_root):
470
+ # dset = MMAUDataset(dataset_root, train=False, cleanup_dataset=True, specific_categories=["42"])
471
+
472
+ dset = MMAUDataset(dataset_root, train=False, cleanup_dataset=True)
473
+ # s = dset.__getitem__(0)
474
+
475
+ # dset = MMAUDataset(dataset_root, train=False, cleanup_dataset=True)
476
+ # s = dset.__getitem__(0)
477
+ # Trigger label and bbox image cache generation
478
+ # from time import time
479
+ # dataset_train = DADA2000Dataset(root=dataset_root, train=True, clip_length=25, non_overlapping_clips=False)
480
+ # t = time()
481
+ # for i in tqdm(range(len(dataset_train))):
482
+ # d = dataset_train[i]
483
+ # if i >= 100:
484
+ # print("Time:", time() - t)
485
+ # print("break")
486
+
487
+ # dataset_val = DADA2000Dataset(root=dataset_root, train=False, clip_length=25, non_overlapping_clips=True)
488
+ # for i in tqdm(range(len(dataset_val))):
489
+ # d = dataset_val[i]
490
+
491
+ # print("Done.")
492
+
493
+ if __name__ == "__main__":
494
+ dataset_root = "/path/to/Datasets"
495
+ pre_cache_dataset(dataset_root)
496
+
497
+ MMAUDataset(dataset_root, train=True)
498
+
499
+
500
+ """
501
+ ACCIDENT TYPES
502
+ {
503
+ "ego_car_involved": {
504
+ "self_initiated": {
505
+ "out_of_control": [61]
506
+ },
507
+ "dynamic_participants": {
508
+ "person_centric": {
509
+ "pedestrian": [1, 2],
510
+ "cyclist": [3, 4]
511
+ },
512
+ "vehicle_centric": {
513
+ "motorbike": [5, 6],
514
+ "truck": [7, 8, 9],
515
+ "car": [10, 11, 12]
516
+ }
517
+ },
518
+ "static_participants": {
519
+ "road_crentric": {
520
+ "large_roadblocks": [13],
521
+ "curb": [14],
522
+ "small_roadblocks_potholes": [15]
523
+ },
524
+ "other_semantics_centric": {
525
+ "trees": [16],
526
+ "telegraph_poles": [17],
527
+ "other_road_facilities": [18]
528
+ }
529
+ }
530
+ },
531
+ "ego_car_uninvolved": {
532
+ "dynamic_participants": {
533
+ "vehicle_centric": {
534
+ "motorbike_motorbike": [37, 38],
535
+ "truck_truck": [39, 40, 41],
536
+ "car_car": [42, 43, 44],
537
+ "motorbike_truck": [45, 46, 47],
538
+ "truck_car": [48, 49],
539
+ "car_motorbike": [50, 51]
540
+ },
541
+ "person_centric": [52, 53, 54, 55, 56, 57, 58, 59, 60]
542
+ },
543
+ "static_participants" : [19, 20, 21, 22, 25, 26, 28, 29, 31, 32, 34, 35, 36]
544
+ },
545
+ }
546
+ """
547
+
548
+
549
+
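A note on the clip-typing logic above: `_collect_clips` classifies each candidate window as `crash`, `abnormal`, or `normal` relative to the accident metadata. The sketch below restates that rule as a standalone function so it can be checked in isolation; the helper name `clip_type` and the sample indices are illustrative, not part of the file.

```python
# Minimal sketch of the window-classification rule used in MMAUDataset._collect_clips.
# A window is "crash" if it spans the accident frame, "abnormal" if it overlaps the
# abnormal-driving interval or starts after the accident, and "normal" otherwise.

def clip_type(start_idx, end_idx, ab_start, acc_idx, ab_end):
    if start_idx <= acc_idx <= end_idx:
        return "crash"
    if (ab_start <= start_idx <= ab_end) or (ab_start <= end_idx <= ab_end) or start_idx > acc_idx:
        return "abnormal"
    return "normal"

if __name__ == "__main__":
    # Illustrative accident metadata (abnormal start, accident frame, abnormal end),
    # already downsampled to the frame indices used by the dataset.
    ab_start, acc_idx, ab_end = 40, 55, 70
    for start in (0, 20, 45, 60):
        end = start + 25  # clip_length
        print(start, end, clip_type(start, end, ab_start, acc_idx, ab_end))
```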
src/datasets/nuscenes_dataset.py ADDED
@@ -0,0 +1,298 @@
1
+ from nuscenes.nuscenes import NuScenes
2
+ from nuscenes.utils.geometry_utils import view_points
3
+
4
+ import numpy as np
5
+ from pyquaternion import Quaternion
6
+ import os
8
+ from nuscenes.utils.splits import create_splits_scenes
9
+ from shapely.geometry import MultiPoint, box
10
+ from typing import List, Tuple, Union
11
+ import json
12
+ from tqdm import tqdm
13
+
14
+ from src.datasets.base_dataset import BaseDataset
15
+
16
+
17
+ # "Singleton" that holds the data so we only have to load once for training & validation
18
+ nusc_data = None
19
+
20
+ class NuScenesDataset(BaseDataset):
21
+
22
+ CLASS_NAME_TO_ID = {
23
+ "animal": 1,
24
+ "human.pedestrian.adult": 1,
25
+ "human.pedestrian.child": 1,
26
+ "human.pedestrian.construction_worker": 1,
27
+ "human.pedestrian.personal_mobility": 1,
28
+ "human.pedestrian.police_officer": 1,
29
+ "human.pedestrian.stroller": 1,
30
+ "human.pedestrian.wheelchair": 1,
31
+
32
+ # "movable_object.barrier": 10,
33
+ # "movable_object.debris": 10,
34
+ # "movable_object.pushable_pullable": 10,
35
+ # "movable_object.trafficcone": 10,
36
+ # "static_object.bicycle_rack": 10,
37
+
38
+ "vehicle.bicycle": 8,
39
+
40
+ "vehicle.bus.bendy": 5,
41
+ "vehicle.bus.rigid": 5,
42
+
43
+ "vehicle.car": 3,
44
+ "vehicle.emergency.police": 3,
45
+
46
+ "vehicle.construction": 4,
47
+ "vehicle.emergency.ambulance": 4,
48
+ "vehicle.trailer": 4,
49
+ "vehicle.truck": 4,
50
+
51
+ "vehicle.motorcycle": 7,
52
+
53
+ "None": 10,
54
+ }
55
+
56
+ def __init__(self,
57
+ root='./datasets',
58
+ train=True,
59
+ clip_length=25,
60
+ orig_height=900, orig_width=1600,
61
+ resize_height=320, resize_width=512,
62
+ non_overlapping_clips=False,
63
+ bbox_masking_prob=0.0,
64
+ test_split=False,
65
+ ego_only=False):
66
+
67
+ super(NuScenesDataset, self).__init__(root=root,
68
+ train=train,
69
+ clip_length=clip_length,
70
+ resize_height=resize_height,
71
+ resize_width=resize_width,
72
+ non_overlapping_clips=non_overlapping_clips,
73
+ bbox_masking_prob=bbox_masking_prob)
74
+
75
+ self.dataset_name = 'nuscenes'
76
+ self.train = train
77
+ self.orig_width = orig_width
78
+ self.orig_height = orig_height
79
+ self.non_overlapping_clips = non_overlapping_clips
80
+ self.bbox_image_dir = os.path.join(self.root, self.dataset_name, "bbox_images", self.data_split)
81
+ self.label_dir = os.path.join(self.root, self.dataset_name, "labels", self.data_split)
82
+ os.makedirs(self.label_dir, exist_ok=True)
83
+
84
+ self.inst_token_to_track_id = {}
85
+
86
+ split_scenes = self._load_nusc(test_split)
87
+ self._collect_clips(split_scenes)
88
+
89
+
90
+ def _load_nusc(self, test_split):
91
+ global nusc_data
92
+ if nusc_data is None:
93
+ data_split = 'v1.0-trainval' if not test_split else 'v1.0-test'
94
+ # data_split = 'v1.0-mini' # Or: 'v1.0-mini' for testing
95
+ nusc_data = NuScenes(version=data_split,
96
+ dataroot=os.path.join(self.root, self.dataset_name),
97
+ verbose=True)
98
+ self.nusc = nusc_data
99
+
100
+ dataset_split = 'train' if self.train else 'val'
101
+ if test_split:
102
+ dataset_split = 'test'
103
+
104
+ split_scene_names = create_splits_scenes()[dataset_split] # [train: 700, val: 150, test: 150]
105
+ split_scenes = [scene for scene in nusc_data.scene if scene['name'] in split_scene_names]
106
+
107
+ return split_scenes
108
+
109
+
110
+ def _collect_clips(self, split_scenes):
111
+ image_indices_by_scene = {}
112
+
113
+ def collect_frame(scene_idx, sample_data):
114
+ # Get image
115
+ image_path = os.path.join(self.root, self.dataset_name, sample_data['filename'])
116
+ self.image_files.append(image_path)
117
+ if image_indices_by_scene.get(scene_idx) is None:
118
+ image_indices_by_scene[scene_idx] = []
119
+ image_indices_by_scene[scene_idx].append(len(self.image_files) - 1)
120
+
121
+ # Parse label
122
+ labels = self._parse_label(sample_data["token"])
123
+ self.frame_labels.append(labels)
124
+
125
+ # Interpolating annotations to increase the frame rate (nuscenes annotation fps=2Hz, video data fps=12Hz)
126
+ self.fps = 7
127
+ target_period = 1/self.fps # For fps downsampling
128
+ max_frames_per_scene = 75
129
+ print("Collecting nuscenes clips...")
130
+ for scene_i, scene in enumerate(split_scenes):
131
+
132
+ curr_data_token = self.nusc.get('sample', scene['first_sample_token'])['data']["CAM_FRONT"]
133
+ curr_sample_data = self.nusc.get('sample_data', curr_data_token)
134
+ collect_frame(scene_i, curr_sample_data)
135
+
136
+ cumul_delta = 0
137
+ total_delta = 0
138
+ t = 0
139
+ while curr_data_token:
140
+ curr_sample_data = self.nusc.get('sample_data', curr_data_token)
141
+
142
+ next_sample_data_token = curr_sample_data['next']
143
+ if not next_sample_data_token:
144
+ break
145
+ next_sample_data = self.nusc.get('sample_data', next_sample_data_token)
146
+
147
+ # FPS downsampling: only select certain frames based on elapsed times
148
+ delta = (next_sample_data['timestamp'] - curr_sample_data['timestamp']) / 1e6
149
+ cumul_delta += delta
150
+ total_delta += delta
151
+ if cumul_delta >= target_period:
152
+ collect_frame(scene_i, next_sample_data)
153
+ t += 1
154
+ cumul_delta = cumul_delta - target_period
155
+
156
+ curr_data_token = next_sample_data_token
157
+
158
+ if len(image_indices_by_scene[scene_i]) > max_frames_per_scene:
159
+ break
160
+
161
+ # print(f"Fps: {len(image_indices_by_scene[scene_i]) / total_delta:.4f}")
162
+
163
+ if not self.non_overlapping_clips:
164
+ for image_idx in range(len(image_indices_by_scene[scene_i]) - self.clip_length + 1):
165
+ self.clip_list.append(image_indices_by_scene[scene_i][image_idx:image_idx+self.clip_length])
166
+ else:
167
+ # In case self.clip_length << actual video sample length (~20s), we can create multiple non-overlapping clips for each sample
168
+ total_frames = len(image_indices_by_scene[scene_i])
169
+ for clip_i in range(total_frames // self.clip_length):
170
+ start_image_idx = clip_i * self.clip_length
171
+ self.clip_list.append(image_indices_by_scene[scene_i][start_image_idx:start_image_idx+self.clip_length])
172
+
173
+ print("Number of nuScenes clips:", len(self.clip_list), f"({'train' if self.train else 'val'})")
174
+
175
+
176
+ def _parse_label(self, token):
177
+
178
+ cam_front_data = self.nusc.get('sample_data', token)
179
+
180
+ # Check cache, if it doesn't exist, then create label file
181
+ filename = cam_front_data["filename"].split('/')[-1].split('.')[0]
182
+ label_file_path = os.path.join(self.label_dir, f"{filename}.json")
183
+ if os.path.exists(label_file_path):
184
+ with open(label_file_path, 'r') as json_file:
185
+ object_labels = json.load(json_file)
186
+
187
+ return object_labels
188
+ else:
189
+ front_camera_sensor = self.nusc.get('calibrated_sensor', cam_front_data['calibrated_sensor_token'])
190
+ camera_intrinsic = np.array(front_camera_sensor['camera_intrinsic'])
191
+ ego_pose = self.nusc.get('ego_pose', cam_front_data['ego_pose_token'])
192
+
193
+ object_labels = []
194
+ bbox_center_by_track_id = {}
195
+ for bbox_3d in self.nusc.get_boxes(token):
196
+
197
+ class_name = bbox_3d.name
198
+ if class_name not in NuScenesDataset.CLASS_NAME_TO_ID:
199
+ continue
200
+ class_id = NuScenesDataset.CLASS_NAME_TO_ID[class_name]
201
+
202
+ instance_token = self.nusc.get('sample_annotation', bbox_3d.token)['instance_token']
203
+ if instance_token not in self.inst_token_to_track_id:
204
+ self.inst_token_to_track_id[instance_token] = len(self.inst_token_to_track_id)
205
+
206
+ # Project 3D bboxes to 2D
207
+ # (Code adapted from: https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/scripts/export_2d_annotations_as_json.py)
208
+
209
+ # Move them to the ego-pose frame.
210
+ bbox_3d.translate(-np.array(ego_pose['translation']))
211
+ bbox_3d.rotate(Quaternion(ego_pose['rotation']).inverse)
212
+
213
+ # Move them to the calibrated sensor frame.
214
+ bbox_3d.translate(-np.array(front_camera_sensor['translation']))
215
+ bbox_3d.rotate(Quaternion(front_camera_sensor['rotation']).inverse)
216
+
217
+ # Filter out the corners that are not in front of the calibrated sensor.
218
+ corners_3d = bbox_3d.corners()
219
+ in_front = np.argwhere(corners_3d[2, :] > 0).flatten()
220
+ corners_3d = corners_3d[:, in_front]
221
+
222
+ # Project 3d box to 2d.
223
+ corner_coords = view_points(corners_3d, camera_intrinsic, True).T[:, :2].tolist()
224
+
225
+ # Keep only corners that fall within the image.
226
+ final_coords = self._post_process_coords(corner_coords)
227
+
228
+ # Skip if the convex hull of the re-projected corners does not intersect the image canvas.
229
+ if final_coords is None:
230
+ continue
231
+
232
+ min_x, min_y, max_x, max_y = final_coords
233
+ track_id = self.inst_token_to_track_id[instance_token]
234
+
235
+ bbox_center_by_track_id[track_id] = bbox_3d.center
236
+
237
+ obj_label = {
238
+ 'frame_name': cam_front_data["filename"],
239
+ 'track_id': track_id,
240
+ 'bbox': [min_x, min_y, max_x, max_y],
241
+ 'class_id': class_id,
242
+ 'class_name': class_name,
243
+ }
244
+
245
+ object_labels.append(obj_label)
246
+
247
+ # Render the furthest bboxes first (closer ones should be on top)
248
+ object_labels.sort(key=lambda label: np.linalg.norm(bbox_center_by_track_id[label["track_id"]]), reverse=True)
249
+
250
+ # Cache file
251
+ with open(label_file_path, 'w') as json_file:
252
+ json.dump(object_labels, json_file)
253
+ print("Cached labels:", label_file_path)
254
+
255
+ return object_labels
256
+
257
+
258
+ def _post_process_coords(self, corner_coords: List) -> Union[Tuple[float, float, float, float], None]:
259
+ """
260
+ Get the intersection of the convex hull of the reprojected bbox corners and the image canvas, return None if no intersection.
261
+ :param corner_coords: Corner coordinates of reprojected bounding box.
262
+ :param imsize: Size of the image canvas.
263
+ :return: Intersection of the convex hull of the 2D box corners and the image canvas.
264
+ """
265
+ imsize = (self.orig_width, self.orig_height)
266
+
267
+ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull
268
+ img_canvas = box(0, 0, imsize[0], imsize[1])
269
+
270
+ if polygon_from_2d_box.intersects(img_canvas):
271
+ img_intersection = polygon_from_2d_box.intersection(img_canvas)
272
+ intersection_coords = np.array([coord for coord in img_intersection.exterior.coords])
273
+
274
+ min_x = min(intersection_coords[:, 0])
275
+ min_y = min(intersection_coords[:, 1])
276
+ max_x = max(intersection_coords[:, 0])
277
+ max_y = max(intersection_coords[:, 1])
278
+
279
+ return min_x, min_y, max_x, max_y
280
+ else:
281
+ return None
282
+
283
+
284
+ def pre_cache_dataset(dataset_root):
285
+ # Trigger label and bbox image cache generation
286
+ dataset_val = NuScenesDataset(root=dataset_root, train=False, clip_length=25, non_overlapping_clips=True)
287
+ for i in tqdm(range(len(dataset_val))):
288
+ d = dataset_val[i]
289
+
290
+ dataset_train = NuScenesDataset(root=dataset_root, train=True, clip_length=25, non_overlapping_clips=True)
291
+ for i in tqdm(range(len(dataset_train))):
292
+ d = dataset_train[i]
293
+
294
+ print("Done.")
295
+
296
+ if __name__ == "__main__":
297
+ dataset_root = "/path/to/Datasets"
298
+ pre_cache_dataset(dataset_root)
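The 2D-box construction in `_parse_label` relies on `_post_process_coords` to clip the re-projected 3D-box corners against the image canvas. Below is a minimal, self-contained sketch of that clipping step using the same shapely primitives; the function name and the example corner coordinates are illustrative only.

```python
# Minimal sketch of the corner-clipping step behind NuScenesDataset._post_process_coords:
# take the convex hull of the projected corners, intersect it with the image canvas,
# and keep the axis-aligned bounds of the intersection.
from shapely.geometry import MultiPoint, box

def clip_corners_to_canvas(corner_coords, img_w=1600, img_h=900):
    hull = MultiPoint(corner_coords).convex_hull
    canvas = box(0, 0, img_w, img_h)
    if not hull.intersects(canvas):
        return None  # the re-projected box lies entirely outside the image
    # .bounds returns (min_x, min_y, max_x, max_y) of the clipped polygon
    return hull.intersection(canvas).bounds

if __name__ == "__main__":
    # Corners partially outside the 1600x900 canvas get clipped to the image border
    corners = [(-50.0, 100.0), (200.0, 80.0), (220.0, 400.0), (-30.0, 420.0)]
    print(clip_corners_to_canvas(corners))  # approximately (0.0, 80.0, 220.0, 420.0)
```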
src/datasets/russia_crash_dataset.py ADDED
@@ -0,0 +1,173 @@
1
+ import os
2
+ import json
3
+ from tqdm import tqdm
+
4
+ from src.datasets.base_dataset import BaseDataset
5
+
6
+
7
+ class RussiaCrashDataset(BaseDataset):
8
+
9
+ CLASS_NAME_TO_ID = {
10
+ 'person': 1,
11
+ 'car': 3,
12
+ 'truck': 4,
13
+ 'bus': 5,
14
+ 'train': 6,
15
+ 'motorcycle': 7,
16
+ 'bicycle': 8,
17
+ }
18
+
19
+ def __init__(self,
20
+ root='./datasets',
21
+ train=True,
22
+ clip_length=25,
23
+ orig_height=555, orig_width=986,
24
+ resize_height=320, resize_width=512,
25
+ non_overlapping_clips=False,
26
+ bbox_masking_prob=0.0,
27
+ sample_clip_from_end=True,
28
+ ego_only=False,
29
+ specific_samples=None):
30
+
31
+ super(RussiaCrashDataset, self).__init__(root=root,
32
+ train=train,
33
+ clip_length=clip_length,
34
+ resize_height=resize_height,
35
+ resize_width=resize_width,
36
+ non_overlapping_clips=non_overlapping_clips,
37
+ bbox_masking_prob=bbox_masking_prob,
38
+ sample_clip_from_end=sample_clip_from_end,
39
+ ego_only=ego_only)
40
+
41
+ self.dataset_name = "preprocess_russia_crash"
42
+
43
+ self.orig_width = orig_width
44
+ self.orig_height = orig_height
45
+ self.image_dir = os.path.join(self.root, self.dataset_name, "images", self.data_split)
46
+ self.label_dir = os.path.join(self.root, self.dataset_name, "labels", self.data_split)
47
+ self.bbox_image_dir = os.path.join(self.root, self.dataset_name, "bbox_images", self.data_split)
48
+
49
+ self.specific_samples = specific_samples
50
+
51
+ self._collect_clips()
52
+
53
+ def _collect_clips(self):
54
+ image_indices_by_clip = {}
55
+ for label_file in sorted(os.listdir(self.label_dir)):
56
+ if not label_file.endswith('.json'):
57
+ continue
58
+
59
+ full_filename = os.path.join(self.label_dir, label_file)
60
+ with open(full_filename) as json_file:
61
+ all_data = json.load(json_file)
62
+ metadata = all_data['metadata']
63
+
64
+ # Only include dashcam samples
65
+ if metadata['camera'] != "Dashcam":
66
+ continue
67
+ # Exclude animal and "other" accidents
68
+ if metadata['accident_type'] == "Risk of collision/collision with an animal":
69
+ continue
70
+ if metadata['accident_type'] == 'Other types of traffic accidents':
71
+ continue
72
+ # NOTE uncomment to only include actual car collision (no close misses and dangerous events)
73
+ # if metadata['collision_type'] == "No Collision":
74
+ # continue
75
+ if self.ego_only:
76
+ print("Ego collisions only activated!")
77
+ if metadata['collision_type'] == "No Collision" or metadata["ego_car_involved"] != "Yes":
78
+ continue
79
+
80
+ clip_filename = label_file.split('.')[0]
81
+ clip_file = os.path.join(self.image_dir, clip_filename)
82
+
83
+ if self.specific_samples is not None and clip_filename not in self.specific_samples:
84
+ continue
85
+
86
+ if len(os.listdir(clip_file)) < self.clip_length:
87
+ # print(f"{clip_filename} does not have enough frames: has {len(os.listdir(clip_file))} expected at least {self.clip_length}")
88
+ continue
89
+
90
+ clip_label_data = self._parse_clip_labels(all_data["data"])
91
+ self.frame_labels.extend(clip_label_data) # In this case labels are already sorted so they will match up to the image indices
92
+
93
+ image_indices_by_clip[clip_filename] = []
94
+ for image_file in sorted(os.listdir(clip_file)):
95
+ self.image_files.append(os.path.join(clip_file, image_file))
96
+ image_indices_by_clip[clip_filename].append(len(self.image_files)-1)
97
+
98
+ assert len(self.frame_labels) == len(self.image_files) # We assume a one-to-one association between images and labels
99
+
100
+ # Cut the videos in clips of the correct length according to the strategies chosen
101
+ if not self.non_overlapping_clips:
102
+ for image_idx in range(len(image_indices_by_clip[clip_filename]) - self.clip_length + 1):
103
+ self.clip_list.append(image_indices_by_clip[clip_filename][image_idx:image_idx+self.clip_length])
104
+ else:
105
+ if self.sample_clip_from_end:
106
+ # In case self.clip_length << actual video sample length, we can create multiple non-overlapping clips for each sample
107
+ # Prioritize selecting clips from the end, to make sure the accident is included (it tends to be at the end of the videos)
108
+ total_frames = len(image_indices_by_clip[clip_filename])
109
+ for clip_i in range(total_frames // self.clip_length):
110
+ start_image_idx = total_frames - (self.clip_length * (clip_i + 1))
111
+ end_image_idx = total_frames - (self.clip_length * clip_i)
112
+ self.clip_list.append(image_indices_by_clip[clip_filename][start_image_idx:end_image_idx])
113
+ else:
114
+ total_frames = len(image_indices_by_clip[clip_filename])
115
+ for clip_i in range(total_frames // self.clip_length):
116
+ start_image_idx = clip_i * self.clip_length
117
+ end_image_idx = start_image_idx + self.clip_length
118
+ self.clip_list.append(image_indices_by_clip[clip_filename][start_image_idx:end_image_idx])
119
+
120
+ print("Number of clips Russia_crash:", len(self.clip_list), f"({self.data_split})")
121
+
122
+ def _parse_clip_labels(self, clip_data):
123
+ frame_labels = []
124
+ for frame_data in clip_data:
125
+ obj_data = frame_data['labels']
126
+
127
+ object_labels = []
128
+ for label in obj_data:
129
+ # Only keep the classes of interest
130
+ class_id = RussiaCrashDataset.CLASS_NAME_TO_ID.get(label['name'])
131
+ if class_id is None:
132
+ continue
133
+
134
+ # Convert bbox coordinates to pixel space wrt to image size
135
+ bbox = label['box']
136
+ bbox_coords_pixel = [int(bbox[0] * self.orig_width), # x1
137
+ int(bbox[1] * self.orig_height), # y1
138
+ int(bbox[2] * self.orig_width), # x2
139
+ int(bbox[3] * self.orig_height)] # y2
140
+
141
+ object_labels.append({
142
+ 'frame_name': frame_data["image_source"],
143
+ 'track_id': int(label['track_id']),
144
+ 'bbox': bbox_coords_pixel,
145
+ 'class_id': class_id,
146
+ 'class_name': label['name'], # Class name of the object
147
+ })
148
+
149
+ frame_labels.append(object_labels)
150
+
151
+ return frame_labels
152
+
153
+
154
+ def pre_cache_dataset(dataset_root):
155
+ # Trigger label and bbox image cache generation
156
+ dataset_val = RussiaCrashDataset(root=dataset_root, train=False, clip_length=25, non_overlapping_clips=True)
157
+ for i in tqdm(range(len(dataset_val))):
158
+ d = dataset_val[i]
159
+
160
+ dataset_train = RussiaCrashDataset(root=dataset_root, train=True, clip_length=25, non_overlapping_clips=True)
161
+ for i in tqdm(range(len(dataset_train))):
162
+ d = dataset_train[i]
163
+
164
+ print("Done.")
165
+
166
+ if __name__ == "__main__":
167
+ from tqdm import tqdm
168
+
169
+ dataset_root = "/path/to/Datasets"
170
+ pre_cache_dataset(dataset_root)
171
+
172
+
173
+
src/eval/README.md ADDED
@@ -0,0 +1,120 @@
1
+ # Video Quality Evaluation Tools
2
+
3
+ This directory contains scripts for evaluating video quality metrics between generated and ground truth videos. There are four main evaluation scripts:
4
+
5
+ 1. `video_quality_metrics_fvd_pair.py`: Evaluates FVD (Fréchet Video Distance) between paired generated and ground truth videos
6
+ 2. `video_quality_metrics_fvd_gt_rand.py`: Evaluates FVD using pre-computed ground truth statistics
7
+ 3. `video_quality_metrics_jedi_pair.py`: Evaluates JEDi metric between paired generated and ground truth videos
8
+ 4. `video_quality_metrics_jedi_gt_rand.py`: Evaluates JEDi metric using random ground truth samples
9
+
10
+ ## Video Generation
11
+
12
+ Before running the evaluation scripts, you'll need to generate video samples using the `run_gen_videos.py` script:
13
+
14
+ ```bash
15
+ python run_gen_videos.py \
16
+ --model_path /path/to/model/checkpoint \
17
+ --output_path /path/to/output/videos \
18
+ --data_root /path/to/dataset_root \
19
+ --num_demo_samples 10 \
20
+ --max_output_vids 200 \
21
+ --num_gens_per_sample 1 \
22
+ --eval_output
23
+ ```
24
+
25
+ ### Key Generation Arguments
26
+
27
+ ```bash
28
+ --model_path PATH # Path to model checkpoint (required)
29
+ --data_root PATH # Dataset root path
30
+ --output_path PATH # Where to save generated videos
31
+ --num_demo_samples N # Number of samples to collect for generation
32
+ --max_output_vids N # Maximum number of videos to generate
33
+ --num_gens_per_sample N # Videos to generate per test case
34
+
35
+ # Optional arguments for controlling generation
36
+ --bbox_mask_idx_batch N1 N2 ... # Where to start masking (0-25)
37
+ --force_action_type_batch N1 N2 ... # Force specific action types (0-4)
38
+ --guidance_scales N1 N2 ... # Guidance scales to use
39
+ --seed N # Random seed for reproducibility
40
+ --disable_null_model # Disable null model for unconditional noise
41
+ --use_factor_guidance # Use factor guidance during generation
42
+ --eval_output # Enable evaluation output
43
+ ```
44
+
45
+ ### Action Types
46
+ - 0: Normal driving
47
+ - 1-4: Different types of crash scenarios
48
+
49
+ ## Common Arguments for Evaluation
50
+
51
+ All evaluation scripts share some common command line arguments:
52
+
53
+ ```bash
54
+ --vid_root PATH # Root directory containing generated videos (required)
55
+ --samples N # Number of samples to evaluate (default: 200)
56
+ --num_frames N # Number of frames per video (default: 25)
57
+ --downsample_int N # Downsample interval for frames (default: 1)
58
+ --action_type N # Action type to filter videos (0: normal, 1-4: crash types)
59
+ --shuffle # Shuffle videos before evaluation
60
+ ```
61
+
62
+ ## FVD Evaluation
63
+
64
+ ### Paired Evaluation
65
+ ```bash
66
+ python video_quality_metrics_fvd_pair.py \
67
+ --vid_root /path/to/videos \
68
+ --samples 200 \
69
+ --num_frames 25 \
70
+ --downsample_int 1
71
+ ```
72
+
73
+ ### Ground Truth Statistics Evaluation
74
+ ```bash
75
+ # First, collect ground truth statistics
76
+ python video_quality_metrics_fvd_gt_rand.py \
77
+ --vid_root /path/to/videos \
78
+ --collect_stats \
79
+ --samples 500 \
80
+ --action_type 1
81
+
82
+ # Then evaluate using the collected statistics
83
+ python video_quality_metrics_fvd_gt_rand.py \
84
+ --vid_root /path/to/videos \
85
+ --gt_stats /path/to/stats.npz \
86
+ --samples 200 \
87
+ --shuffle
88
+ ```
89
+
90
+ ## JEDi Evaluation
91
+
92
+ ### Paired Evaluation
93
+ ```bash
94
+ python video_quality_metrics_jedi_pair.py \
95
+ --vid_root /path/to/videos \
96
+ --samples 200 \
97
+ --num_frames 25 \
98
+ --test_feature_path /path/to/features
99
+ ```
100
+
101
+ ### Ground Truth Random Evaluation
102
+ ```bash
103
+ python video_quality_metrics_jedi_gt_rand.py \
104
+ --vid_root /path/to/videos \
105
+ --samples 200 \
106
+ --gt_samples 500 \
107
+ --test_feature_path /path/to/features \
108
+ --action_type 1 \
109
+ --shuffle
110
+ ```
111
+
112
+ ## Additional Notes
113
+
114
+ - The `--action_type` argument can be used to filter videos by category:
115
+ - 0: Normal driving videos
116
+ - 1-4: Different types of crash videos
117
+ - For FVD evaluation with ground truth statistics, you can collect statistics once and reuse them for multiple evaluations
118
+ - The JEDi metric requires a test feature path for model loading
119
+ - All scripts support shuffling of videos before evaluation for more robust results
120
+ - The default resolution for videos is 320x512 pixels
src/eval/__pycache__/generate_samples.cpython-310.pyc ADDED
Binary file (9.13 kB).
 
src/eval/generate_samples.py ADDED
@@ -0,0 +1,394 @@
1
+ import os
2
+ from PIL import Image, ImageDraw
3
+ import cv2
4
+ from tqdm import tqdm
5
+ import json
6
+ import argparse
7
+
8
+ import warnings
9
+ import numpy as np
10
+ import torch
11
+ torch.cuda.empty_cache()
12
+ import torch.utils.checkpoint
13
+ from accelerate.utils import set_seed
14
+
15
+ with warnings.catch_warnings():
16
+ warnings.simplefilter("ignore")
17
+ from src.pipelines import StableVideoControlPipeline
18
+ from src.pipelines import StableVideoControlNullModelPipeline
19
+ from src.pipelines import StableVideoControlFactorGuidancePipeline
20
+
21
+ from src.models import UNetSpatioTemporalConditionModel, ControlNetModel
22
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
23
+ from diffusers.models import AutoencoderKLTemporalDecoder
24
+
25
+ from src.datasets.dataset_utils import get_dataloader
26
+ from src.utils import get_samples
27
+
28
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
29
+ print(f"Device: {device}")
30
+
31
+ generator = None #torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None
32
+ CLIP_LENGTH = 25
33
+
34
+
35
+ def create_video_from_np(sample, video_path, fps=6):
36
+ video_filename = f"{video_path}.mp4"
37
+ frame_size = (512, 320)
38
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
39
+ video_writer_out = cv2.VideoWriter(video_filename, fourcc, fps, frame_size)
40
+
41
+ for img in sample:
42
+ img = np.transpose(img, (1, 2, 0))
43
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
44
+ video_writer_out.write(img)
45
+
46
+ video_writer_out.release()
47
+ print(f"Video saved: {video_filename}")
48
+
49
+
50
+ def export_to_video(video_frames, output_video_path=None, fps=6):
51
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
52
+ h, w, c = video_frames[0].shape
53
+ video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=fps, frameSize=(w, h))
54
+ for i in range(len(video_frames)):
55
+ img = cv2.cvtColor(video_frames[i].astype(np.uint8), cv2.COLOR_RGB2BGR)
56
+ video_writer.write(img)
57
+ return output_video_path
58
+
59
+
60
+ def label_frames_with_action_id(bbox_frames, action_id, masked_idx=None):
61
+ action_name = {0: "Normal", 1: "Ego", 2: "Ego/Veh", 3: "Veh", 4: "Veh/Veh"}
62
+ action_text = f"Action: {action_name[action_id]} ({action_id})"
63
+ for i in range(bbox_frames.shape[0]):
64
+ # Convert numpy array to PIL Image
65
+ frame = Image.fromarray(bbox_frames[i].transpose(1, 2, 0))
66
+ draw = ImageDraw.Draw(frame)
67
+
68
+ # Add text in top right corner
69
+ text_position = (frame.width - 10, 10) # 10 pixels from top, 10 pixels from right
70
+ if masked_idx is not None and masked_idx <= i:
71
+ text_color = (0, 0, 0)
72
+ action_text = f"Action: {action_name[action_id]} ({action_id}) [masked]"
73
+ else:
74
+ text_color = (255, 255, 255)
75
+ action_text = action_text
76
+
77
+ draw.text(text_position, action_text, fill=text_color, anchor="ra")
78
+
79
+ # Convert back to numpy array
80
+ bbox_frames[i] = np.array(frame).transpose(2, 0, 1)
81
+
82
+ return bbox_frames
83
+
84
+ def load_ctrlv_pipelines(model_dir, use_null_model=False, use_factor_guidance=False):
85
+ unet_variant = "fp16" if "stabilityai" in model_dir else None
86
+
87
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
88
+ model_dir,
89
+ subfolder="unet",
90
+ variant=unet_variant,
91
+ low_cpu_mem_usage=True,
92
+ num_frames=CLIP_LENGTH
93
+ )
94
+ ctrlnet = ControlNetModel.from_pretrained(
95
+ model_dir,
96
+ subfolder="control_net",
97
+ variant=unet_variant,
98
+ num_frames=25
99
+ )
100
+
101
+ if not use_null_model and not use_factor_guidance:
102
+ pipeline = StableVideoControlPipeline.from_pretrained(
103
+ "stabilityai/stable-video-diffusion-img2vid-xt",
104
+ controlnet=ctrlnet,
105
+ unet=unet,
106
+ variant=unet_variant
107
+ )
108
+
109
+ else:
110
+
111
+ # For null model prediction of uncond noise
112
+ null_model_path = "stabilityai/stable-video-diffusion-img2vid-xt"
113
+ null_model_unet = UNetSpatioTemporalConditionModel.from_pretrained(
114
+ null_model_path,
115
+ subfolder="unet",
116
+ variant=None,
117
+ low_cpu_mem_usage=True,
118
+ num_frames=CLIP_LENGTH
119
+ )
120
+
121
+ if use_null_model and not use_factor_guidance:
122
+ pipeline = StableVideoControlNullModelPipeline.from_pretrained(
123
+ "stabilityai/stable-video-diffusion-img2vid-xt",
124
+ controlnet=ctrlnet,
125
+ unet=unet,
126
+ null_model=null_model_unet,
127
+ variant=unet_variant
128
+ )
129
+ elif use_factor_guidance:
130
+ pipeline = StableVideoControlFactorGuidancePipeline.from_pretrained(
131
+ "stabilityai/stable-video-diffusion-img2vid-xt",
132
+ controlnet=ctrlnet,
133
+ unet=unet,
134
+ null_model=null_model_unet,
135
+ variant=unet_variant
136
+ )
137
+
138
+ pipeline = pipeline.to(device)
139
+ pipeline.set_progress_bar_config(disable=True)
140
+
141
+ unet.eval()
142
+ ctrlnet.eval()
143
+
144
+ return pipeline
145
+
146
+
147
+ def generate_video_ctrlv(sample, pipeline, video_path="video_out/genvid", json_path="video_out/gt_frames", bbox_mask_frames=None, action_type=None, use_factor_guidance=False, guidance=[1.0, 3.0], video_path2=None):
148
+ frame_size = (512, 320)
149
+ FPS = 6
150
+ CLIP_LENGTH = sample['bbox_images'].shape[0]
151
+
152
+ init_image = sample['image_init']
153
+ bbox_images = sample['bbox_images'].unsqueeze(0)
154
+ action_type = sample['action_type'].unsqueeze(0) if action_type is None else action_type
155
+
156
+ sample['bbox_images'].to(device)
157
+
158
+ # Save GT frame paths to json file
159
+ gt_frame_paths = [file_path[0] for file_path in sample['image_paths']]
160
+ with open(json_path, "w") as file:
161
+ json.dump(gt_frame_paths, file, indent=1)
162
+ print("Saved GT frames json file:", json_path)
163
+
164
+ if not use_factor_guidance:
165
+ frames = pipeline(init_image,
166
+ cond_images=bbox_images,
167
+ bbox_mask_frames=bbox_mask_frames,
168
+ action_type=action_type,
169
+ height=frame_size[1], width=frame_size[0],
170
+ decode_chunk_size=8, motion_bucket_id=127, fps=FPS,
171
+ num_inference_steps=30,
172
+ num_frames=CLIP_LENGTH,
173
+ control_condition_scale=1.0,
174
+ min_guidance_scale=guidance[0],
175
+ max_guidance_scale=guidance[1],
176
+ noise_aug_strength=0.01,
177
+ generator=generator, output_type='pt').frames[0]
178
+ else:
179
+ frames = pipeline(init_image,
180
+ cond_images=bbox_images,
181
+ bbox_mask_frames=bbox_mask_frames,
182
+ action_type=action_type,
183
+ height=frame_size[1], width=frame_size[0],
184
+ decode_chunk_size=8, motion_bucket_id=127, fps=FPS,
185
+ num_inference_steps=30,
186
+ num_frames=CLIP_LENGTH,
187
+ control_condition_scale=1.0,
188
+ min_guidance_scale_img=1.0,
189
+ max_guidance_scale_img=3.0,
190
+ min_guidance_scale_action=6.0,
191
+ max_guidance_scale_action=12.0,
192
+ min_guidance_scale_bbox=1.0,
193
+ max_guidance_scale_bbox=3.0,
194
+ noise_aug_strength=0.01,
195
+ generator=generator, output_type='pt').frames[0]
196
+
197
+ frames = frames.detach().cpu().numpy()*255
198
+ frames = frames.astype(np.uint8)
199
+
200
+ tmp = np.transpose(frames, (0, 2, 3, 1))  # (T, C, H, W) -> (T, H, W, C); the extra moveaxis was a no-op
201
+ output_video_path = f"{video_path}.mp4"
202
+ export_to_video(tmp, output_video_path, fps=FPS)
203
+ print(f"Video saved:", output_video_path)
204
+
205
+ if video_path2 is not None:
206
+ output_video_path2 = f"{video_path2}.mp4"
207
+ export_to_video(tmp, output_video_path2, fps=FPS)
208
+
209
+
210
+ def generate_samples(args):
211
+ model_path = args.model_path
212
+ print("Model path:", model_path)
213
+
214
+ if args.seed is not None:
215
+ set_seed(args.seed)
216
+ print("Set seed:", args.seed)
217
+
218
+ # LOAD PIPELINE
219
+ use_null_model = not args.disable_null_model
220
+ use_factor_guidance = args.use_factor_guidance
221
+ pipeline = load_ctrlv_pipelines(model_path, use_null_model=use_null_model, use_factor_guidance=use_factor_guidance)
222
+
223
+ # LOAD DATASET
224
+ data_root = args.data_root
225
+ dataset_name = args.dataset
226
+ train_set = False
227
+ val_dataset, val_loader = get_dataloader(
228
+ data_root, dataset_name, if_train=train_set, clip_length=CLIP_LENGTH,
229
+ batch_size=1, num_workers=0, shuffle=True,
230
+ image_height=320, image_width=512,
231
+ non_overlapping_clips=True, #specific_samples=specific_samples
232
+ )
233
+ if train_set:
234
+ print("WARNING: Currently using training split")
235
+
236
+ # COLLECT SAMPLES
237
+ num_demo_samples = args.num_demo_samples
238
+ demo_samples = get_samples(val_loader, num_demo_samples, show_progress=True)
239
+
240
+ sample_range = range(0, num_demo_samples)
241
+ num_samples = len(sample_range)
242
+
243
+ # video_dir_path = os.path.join(os.getcwd(), "video_out", "video_out_box2video_may1_eval_test")
244
+ video_dir_path = args.output_path
245
+ os.makedirs(video_dir_path, exist_ok=True)
246
+ video_counter = 0
247
+
248
+ # GENERATION PARAMETERS
249
+
250
+ # Set the bbox masking
251
+ bbox_mask_idx_batch = args.bbox_mask_idx_batch
252
+ condition_on_last_bbox = False
253
+
254
+ # Set the action type
255
+ force_action_type = None #1 # 0: Normal, 1: Ego, 2: Ego/Veh, 3: Veh, 4: Veh/Veh
256
+ force_action_type_batch = args.force_action_type_batch
257
+
258
+ num_gens_per_sample = args.num_gens_per_sample
259
+ guidance_scales = args.guidance_scales
260
+ eval_output = args.eval_output
261
+
262
+ # GENERATE VIDEOS
263
+
264
+ # Check for samples that were already done and do not compute them again
265
+ skip_samples = {}
266
+ out_video_path = f"{video_dir_path}/gt_ref"
267
+ if os.path.exists(out_video_path):
268
+ all_videos = os.listdir(out_video_path)
269
+ video_counter = len(all_videos)
270
+ for sample_name in all_videos:
271
+ vid_name = "_".join(sample_name.split("_")[1:])
272
+ skip_samples[vid_name] = True
273
+
274
+ print("SKIP SAMPLES:", skip_samples)
275
+
276
+ for guidance in guidance_scales or [-1]:
277
+
278
+ if guidance != -1:
279
+ print("Guidance:", force_action_type)
280
+ else:
281
+ guidance = [1, 3]
282
+
283
+ for _ in range(num_gens_per_sample):
284
+ for force_action_type in force_action_type_batch or [-1]:
285
+
286
+ if force_action_type != -1:
287
+ print("Force action type:", force_action_type)
288
+ else:
289
+ force_action_type = None
290
+
291
+ for bbox_mask_idx in bbox_mask_idx_batch or [-1]:
292
+
293
+ if bbox_mask_idx != -1:
294
+ print("Bbox masking:", bbox_mask_idx)
295
+ else:
296
+ bbox_mask_idx = None
297
+
298
+ for i, sample in tqdm(enumerate(demo_samples), desc="Generating samples", total=num_samples):
299
+ if i >= list(sample_range)[-1] + 1:
300
+ break
301
+ if i not in sample_range:
302
+ continue
303
+
304
+ if video_counter > args.max_output_vids:
305
+ print(f"MAX OUTPUT VIDS REACHED: {video_counter} >= {args.max_output_vids}")
306
+ exit()
307
+
308
+ vid_name = sample["vid_name"]
309
+
310
+ mask_hint = "" if bbox_mask_idx is None else f"_bframes:{str(bbox_mask_idx)}"
311
+ action_hint = "" if force_action_type is None else f"_action:{str(force_action_type)}"
312
+ guidance_hint = "" if guidance_scales is None else f"_guide{guidance[0]}:{guidance[1]}"
313
+ scene_name = f"{video_counter}_{vid_name}{mask_hint}{action_hint}{guidance_hint}"
314
+
315
+ scene_name_no_counter = "_".join(scene_name.split("_")[1:])
316
+ if scene_name_no_counter in skip_samples:
317
+ print(f"Skipping sample that was already computed: {vid_name}")
318
+ continue
319
+
320
+ print("Generating video for:", scene_name)
321
+
322
+ if eval_output:
323
+ os.makedirs(f"{video_dir_path}/gen_videos", exist_ok=True)
324
+ os.makedirs(f"{video_dir_path}/gt_frames", exist_ok=True)
325
+ os.makedirs(f"{video_dir_path}/gt_ref", exist_ok=True)
326
+
327
+ gt_vid_path = f"{video_dir_path}/gt_ref/{scene_name}/(1)gt_video_{scene_name}"
328
+ bbox_out_path_root = f"{video_dir_path}/gt_ref/{scene_name}"
329
+ out_video_path = f"{video_dir_path}/gen_videos/genvid_{video_counter}_{vid_name}"
330
+ out_json_path = os.path.join(video_dir_path, "gt_frames", f"gt_frames_{video_counter}_{vid_name}.json")
331
+
332
+ out_video_path2 = f"{bbox_out_path_root}/(3)genvid_adv_{scene_name}"
333
+
334
+ os.makedirs(bbox_out_path_root, exist_ok=True)
335
+ else:
336
+ os.makedirs(f"{video_dir_path}/{scene_name}", exist_ok=True)
337
+
338
+ gt_vid_path = f"{video_dir_path}/{scene_name}/(1)gt_video_{scene_name}"
339
+ bbox_out_path_root = f"{video_dir_path}/{scene_name}"
340
+ out_video_path = f"{video_dir_path}/{scene_name}/(3)genvid_adv_{scene_name}"
341
+ out_json_path = os.path.join(video_dir_path, scene_name, f"gt_frames_{sample['vid_name']}.json")
342
+ out_video_path2 = None
343
+
344
+ create_video_from_np(sample['gt_clip_np'], video_path=gt_vid_path)
345
+
346
+ # Add action type text to ground truth bounding box frames # TODO: Make sure the action type aligns if we change it for generation
347
+ action_type = sample['action_type'].unsqueeze(0)
348
+ og_action_type = action_type.item()
349
+ if force_action_type is not None:
350
+ action_type = torch.ones_like(action_type) * force_action_type
351
+
352
+ action_id = action_type.item()
353
+ bbox_frames = sample['bbox_images_np'].copy()
354
+ if bbox_mask_idx is not None:
355
+ # print(f"Masking bboxes after index {bbox_mask_idx}")
356
+
357
+ # Let's save a copy of the original bboxes for reference
358
+ bbox_frames_ref = sample['bbox_images_np'].copy()
359
+ label_frames_with_action_id(bbox_frames_ref, og_action_type)
360
+ create_video_from_np(bbox_frames_ref, video_path=f"{bbox_out_path_root}/(2)video_2dbboxes_{scene_name}_nomask")
361
+
362
+ # For display, let's mask with white
363
+ mask_cond = bbox_mask_idx <= np.arange(CLIP_LENGTH).reshape(CLIP_LENGTH, 1, 1, 1)
364
+ if condition_on_last_bbox:
365
+ mask_cond[-1, 0, 0, 0] = False
366
+ bbox_frames = np.where(mask_cond, np.ones_like(bbox_frames)*255, bbox_frames)
367
+ label_frames_with_action_id(bbox_frames, action_id, masked_idx=bbox_mask_idx)
368
+ else:
369
+ label_frames_with_action_id(bbox_frames, action_id)
370
+
371
+ create_video_from_np(bbox_frames, video_path=f"{bbox_out_path_root}/(2)video_2dbboxes_{scene_name}")
372
+
373
+ bbox_mask_frames = [False] * CLIP_LENGTH
374
+ if bbox_mask_idx is not None:
375
+ bbox_mask_frames[bbox_mask_idx:] = [True] * (len(bbox_mask_frames) - bbox_mask_idx)
376
+ if condition_on_last_bbox:
377
+ bbox_mask_frames[-1] = False
378
+
379
+ generate_video_ctrlv(
380
+ sample,
381
+ pipeline,
382
+ video_path=out_video_path,
383
+ json_path=out_json_path,
384
+ bbox_mask_frames=bbox_mask_frames,
385
+ action_type=action_type,
386
+ use_factor_guidance=use_factor_guidance,
387
+ guidance=guidance,
388
+ video_path2=out_video_path2
389
+ )
390
+
391
+ video_counter += 1
392
+
393
+ print("DONE")
394
+
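For clarity, the per-frame bounding-box masking that `generate_samples` passes to the pipeline boils down to a boolean list: frames at or after `bbox_mask_idx` have their bbox conditioning dropped, optionally keeping the last frame as an end-point condition. A minimal sketch (helper name illustrative):

```python
# Minimal sketch of how bbox_mask_frames is constructed in generate_samples:
# True means "drop the bbox conditioning for this frame".

def build_bbox_mask(clip_length, bbox_mask_idx=None, condition_on_last_bbox=False):
    mask = [False] * clip_length
    if bbox_mask_idx is not None:
        mask[bbox_mask_idx:] = [True] * (clip_length - bbox_mask_idx)
        if condition_on_last_bbox:
            mask[-1] = False  # re-enable the final bbox frame as an end-point condition
    return mask

if __name__ == "__main__":
    print(build_bbox_mask(10, bbox_mask_idx=6))
    # [False, False, False, False, False, False, True, True, True, True]
    print(build_bbox_mask(10, bbox_mask_idx=6, condition_on_last_bbox=True))
    # same as above, except the last entry flips back to False
```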
src/eval/video_dataset.py ADDED
@@ -0,0 +1,79 @@
1
+ import os
2
+ import torch
3
+ from torch.utils.data import Dataset
4
+ import cv2
5
+ import numpy as np
6
+
7
+ class VideoDataset(Dataset):
8
+ def __init__(self, video_root, num_frames=25, downsample_int=1, transform=None):
9
+ """
10
+ Args:
11
+ video_root (str): Directory with all the video files
12
+ num_frames (int): Number of frames to extract from each video
13
+ downsample_int (int): Interval between frames to extract
14
+ transform (callable, optional): Optional transform to be applied on frames
15
+ """
16
+ self.video_root = video_root
17
+ self.num_frames = num_frames
18
+ self.downsample_int = downsample_int
19
+ self.transform = transform
20
+
21
+ # Get list of video files
22
+ self.video_files = []
23
+ gen_videos = os.path.join(video_root, "gen_videos") if os.path.exists(os.path.join(video_root, "gen_videos")) else video_root
24
+ for fname in os.listdir(gen_videos):
25
+ if fname.endswith('.mp4'):
26
+ self.video_files.append(os.path.join(gen_videos, fname))
27
+
28
+ self.video_files.sort()
29
+
30
+ def __len__(self):
31
+ return len(self.video_files)
32
+
33
+ def get_frames_mp4(self, video_path):
34
+ """Extract frames from video file"""
35
+ cap = cv2.VideoCapture(video_path)
36
+ if not cap.isOpened():
37
+ raise ValueError(f"Could not open video file: {video_path}")
38
+
39
+ frames = []
40
+ frame_count = 0
41
+
42
+ while True:
43
+ ret, frame = cap.read()
44
+ if not ret:
45
+ break
46
+
47
+ frame = cv2.resize(frame, (512, 320))
48
+
49
+ if frame_count % self.downsample_int == 0:
50
+ frames.append(frame)
51
+
52
+ frame_count += 1
53
+
54
+ if len(frames) >= self.num_frames:
55
+ break
56
+
57
+ cap.release()
58
+
59
+ if len(frames) < self.num_frames:
60
+ # Pad with last frame if we don't have enough frames
61
+ last_frame = frames[-1] if frames else np.zeros((320, 512, 3), dtype=np.uint8)
62
+ while len(frames) < self.num_frames:
63
+ frames.append(last_frame)
64
+
65
+ return np.array(frames[:self.num_frames])
66
+
67
+ def __getitem__(self, idx):
68
+ video_path = self.video_files[idx]
69
+ frames = self.get_frames_mp4(video_path)
70
+
71
+ # Convert to torch tensor and normalize
72
+ frames = torch.from_numpy(frames).float()
73
+ frames = frames.permute(0, 3, 1, 2) # Change from (T, H, W, C) to (T, C, H, W)
74
+ frames = frames / (255/2.0) - 1.0 # Normalize to [-1, 1]
75
+
76
+ if self.transform:
77
+ frames = self.transform(frames)
78
+
79
+ return frames, []
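A usage sketch for this dataset, assuming a results folder of .mp4 clips (optionally under a gen_videos/ subdirectory) such as the one written by the generation script; the path is illustrative:

from torch.utils.data import DataLoader

from src.eval.video_dataset import VideoDataset

dataset = VideoDataset("outputs/eval_run", num_frames=25, downsample_int=1)
loader = DataLoader(dataset, batch_size=2, shuffle=False, num_workers=4)

for frames, _ in loader:
    # frames: (B, T, C, H, W) float tensor in [-1, 1] with T=25, H=320, W=512
    print(frames.shape, frames.min().item(), frames.max().item())
    break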
src/eval/video_quality_metrics_fvd_gt_rand.py ADDED
@@ -0,0 +1,458 @@
1
+ import numpy as np
2
+ import torch
3
+ import scipy.linalg
4
+ from typing import Tuple
5
+ import torch.nn.functional as F
6
+ import math
7
+ import cv2
8
+ import json
9
+ import random
10
+ import os
11
+ import argparse
12
+ from tqdm import tqdm
13
+
14
+ import numpy as np
15
+ import io
16
+ import re
17
+ import requests
18
+ import html
19
+ import hashlib
20
+ import urllib
21
+ import urllib.request
22
+ import uuid
23
+
24
+ from distutils.util import strtobool
25
+ from typing import Any, List, Tuple, Union, Dict
26
+
27
+ from src.datasets.dataset_utils import get_dataloader
28
+ from src.utils import get_samples
29
+
30
+
31
+ def get_frames_from_path_list(path_list):
32
+ frames = []
33
+ for path in path_list:
34
+ img = cv2.imread(path)
35
+ img = cv2.resize(img, [512, 320])
36
+ frames.append(img)
37
+ return np.array(frames)
38
+
39
+ def get_frames_mp4(video_path: str, frame_interval: int = 1) -> np.ndarray:
40
+
41
+ # Open the video file
42
+ cap = cv2.VideoCapture(video_path)
43
+ if not cap.isOpened():
44
+ raise ValueError(f"Could not open video file: {video_path}")
45
+
46
+ frame_count = 0
47
+ saved_count = 0
48
+
49
+ frames = []
50
+ while True:
51
+ ret, frame = cap.read()
52
+ if not ret:
53
+ break
54
+
55
+ frame = cv2.resize(frame, (512, 320))
56
+
57
+ # Save frame if it's the right interval
58
+ if frame_count % frame_interval == 0:
59
+ frames.append(frame)
60
+ saved_count += 1
61
+
62
+ frame_count += 1
63
+
64
+ cap.release()
65
+ return np.array(frames)
66
+
67
+
68
+ def load_json(filename):
69
+ if os.path.exists(filename):
70
+ with open(filename, "r") as f:
71
+ return json.load(f)
72
+ print(filename, "not found")
73
+ return []
74
+
75
+
76
+ def open_url(url: str, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False) -> Any:
77
+ """Download the given URL and return a binary-mode file object to access the data."""
78
+ assert num_attempts >= 1
79
+
80
+ # Doesn't look like an URL scheme so interpret it as a local filename.
81
+ if not re.match('^[a-z]+://', url):
82
+ return url if return_filename else open(url, "rb")
83
+
84
+ # Handle file URLs. This code handles unusual file:// patterns that
85
+ # arise on Windows:
86
+ #
87
+ # file:///c:/foo.txt
88
+ #
89
+ # which would translate to a local '/c:/foo.txt' filename that's
90
+ # invalid. Drop the forward slash for such pathnames.
91
+ #
92
+ # If you touch this code path, you should test it on both Linux and
93
+ # Windows.
94
+ #
95
+ # Some internet resources suggest using urllib.request.url2pathname() but
96
+     # that converts forward slashes to backslashes and this causes
97
+ # its own set of problems.
98
+ if url.startswith('file://'):
99
+ filename = urllib.parse.urlparse(url).path
100
+ if re.match(r'^/[a-zA-Z]:', filename):
101
+ filename = filename[1:]
102
+ return filename if return_filename else open(filename, "rb")
103
+
104
+ url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
105
+
106
+ # Download.
107
+ url_name = None
108
+ url_data = None
109
+ with requests.Session() as session:
110
+ if verbose:
111
+ print("Downloading %s ..." % url, end="", flush=True)
112
+ for attempts_left in reversed(range(num_attempts)):
113
+ try:
114
+ with session.get(url) as res:
115
+ res.raise_for_status()
116
+ if len(res.content) == 0:
117
+ raise IOError("No data received")
118
+
119
+ if len(res.content) < 8192:
120
+ content_str = res.content.decode("utf-8")
121
+ if "download_warning" in res.headers.get("Set-Cookie", ""):
122
+ links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
123
+ if len(links) == 1:
124
+ url = requests.compat.urljoin(url, links[0])
125
+ raise IOError("Google Drive virus checker nag")
126
+ if "Google Drive - Quota exceeded" in content_str:
127
+ raise IOError("Google Drive download quota exceeded -- please try again later")
128
+
129
+ match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
130
+ url_name = match[1] if match else url
131
+ url_data = res.content
132
+ if verbose:
133
+ print(" done")
134
+ break
135
+ except KeyboardInterrupt:
136
+ raise
137
+ except:
138
+ if not attempts_left:
139
+ if verbose:
140
+ print(" failed")
141
+ raise
142
+ if verbose:
143
+ print(".", end="", flush=True)
144
+
145
+ # Return data as file object.
146
+ assert not return_filename
147
+ return io.BytesIO(url_data)
148
+
149
+ """
150
+ Modified from https://github.com/cvpr2022-stylegan-v/stylegan-v/blob/main/src/metrics/frechet_video_distance.py
151
+ """
152
+ class FVD:
153
+ def __init__(self, device,
154
+ detector_url='https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1',
155
+ rescale=False, resize=False, return_features=True):
156
+
157
+ self.device = device
158
+ self.detector_kwargs = dict(rescale=False, resize=False, return_features=True)
159
+
160
+ with open_url(detector_url, verbose=False) as f:
161
+ self.detector = torch.jit.load(f).eval().to(device)
162
+
163
+ # Initialize ground truth statistics
164
+ self.mu_real = None
165
+ self.sigma_real = None
166
+
167
+ def to_device(self, device):
168
+ self.device = device
169
+ self.detector = self.detector.to(self.device)
170
+
171
+ def _compute_stats(self, feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
172
+ mu = feats.mean(axis=0) # [d]
173
+ sigma = np.cov(feats, rowvar=False) # [d, d]
174
+ return mu, sigma
175
+
176
+ def save_gt_stats(self, save_path: str):
177
+ """Save ground truth statistics to a file."""
178
+ if self.mu_real is None or self.sigma_real is None:
179
+ raise ValueError("Ground truth statistics not computed yet")
180
+
181
+ stats = {
182
+ 'mu_real': self.mu_real,
183
+ 'sigma_real': self.sigma_real
184
+ }
185
+ np.savez(save_path, **stats)
186
+
187
+ def load_gt_stats(self, load_path: str):
188
+ """Load ground truth statistics from a file."""
189
+ stats = np.load(load_path)
190
+ self.mu_real = stats['mu_real']
191
+ self.sigma_real = stats['sigma_real']
192
+
193
+ def preprocess_videos(self, videos, resolution=224, sequence_length=None):
194
+
195
+ b, t, c, h, w = videos.shape
196
+
197
+ # temporal crop
198
+ if sequence_length is not None:
199
+ assert sequence_length <= t
200
+ videos = videos[:, :sequence_length, ::]
201
+
202
+ # b*t x c x h x w
203
+ videos = videos.reshape(-1, c, h, w)
204
+ if c == 1:
205
+ videos = torch.cat([videos, videos, videos], 1)
206
+ c = 3
207
+
208
+ # scale shorter side to resolution
209
+ scale = resolution / min(h, w)
210
+ # import pdb; pdb.set_trace()
211
+ if h < w:
212
+ target_size = (resolution, math.ceil(w * scale))
213
+ else:
214
+ target_size = (math.ceil(h * scale), resolution)
215
+
216
+ videos = F.interpolate(videos, size=target_size).clamp(min=-1, max=1)
217
+
218
+ # center crop
219
+ _, c, h, w = videos.shape
220
+
221
+ h_start = (h - resolution) // 2
222
+ w_start = (w - resolution) // 2
223
+ videos = videos[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
224
+
225
+ # b, c, t, w, h
226
+ videos = videos.reshape(b, t, c, resolution, resolution).permute(0, 2, 1, 3, 4)
227
+
228
+ return videos.contiguous()
229
+
230
+ @torch.no_grad()
231
+ def evaluate(self, video_fake, video_real=None, res=224, use_saved_stats=False, save_stats_path=None):
232
+ """Evaluate FVD score.
233
+
234
+ Args:
235
+ video_fake: Generated videos
236
+ video_real: Ground truth videos (optional if use_saved_stats=True)
237
+ res: Resolution for preprocessing
238
+ use_saved_stats: Whether to use saved ground truth statistics
239
+ """
240
+ video_fake = self.preprocess_videos(video_fake, resolution=res)
241
+ feats_fake = self.detector(video_fake, **self.detector_kwargs).cpu().numpy()
242
+
243
+ if use_saved_stats:
244
+ if self.mu_real is None or self.sigma_real is None:
245
+ raise ValueError("Ground truth statistics not loaded. Call load_gt_stats() first.")
246
+ mu_real = self.mu_real
247
+ sigma_real = self.sigma_real
248
+ else:
249
+ if video_real is None:
250
+ raise ValueError("video_real must be provided when use_saved_stats=False")
251
+ video_real = self.preprocess_videos(video_real, resolution=res)
252
+ feats_real = self.detector(video_real, **self.detector_kwargs).cpu().numpy()
253
+ mu_real, sigma_real = self._compute_stats(feats_real)
254
+ # Save the computed statistics
255
+ self.mu_real = mu_real
256
+ self.sigma_real = sigma_real
257
+ if save_stats_path is not None:
258
+ self.save_gt_stats(save_stats_path)
259
+
260
+ mu_gen, sigma_gen = self._compute_stats(feats_fake)
261
+
262
+ m = np.square(mu_gen - mu_real).sum()
263
+ s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False)
264
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
265
+ return fid
266
+
267
+
268
+ def collect_fvd_stats(data_root, samples=200, downsample_int=1, num_frames=25, save_path=None, action_type=None):
269
+ """Collect and save ground truth statistics for FVD evaluation."""
270
+
271
+ if save_path is None:
272
+ save_path = os.path.join(data_root, "gt_fvd_stats.npz")
273
+
274
+ # Set up category filtering if specified
275
+ specific_categories = None
276
+ force_clip_type = None
277
+ if action_type is not None:
278
+ if action_type == 0:
279
+ force_clip_type = "normal"
280
+ print("Collecting normal samples only")
281
+ else:
282
+ classes_by_action_type = {
283
+ 1: [61, 62, 13, 14, 15, 16, 17, 18],
284
+ 2: list(range(1, 12 + 1)),
285
+ 3: [37, 39, 41, 42, 44] + list(range(19, 36 + 1)) + list(range(52, 60 + 1)),
286
+ 4: [38, 40, 43, 45, 46, 47, 48, 49, 50, 51]
287
+ }
288
+ specific_categories = classes_by_action_type[action_type]
289
+ force_clip_type = "crash"
290
+ print("Collecting crash samples from categories:", specific_categories)
291
+
292
+ # Create dataset and dataloader
293
+ dataset_name = "mmau"
294
+ train_set = True
295
+ val_dataset, val_loader = get_dataloader(data_root, dataset_name,
296
+ if_train=train_set, clip_length=num_frames,
297
+ batch_size=1, num_workers=0, shuffle=True,
298
+ image_height=320, image_width=512,
299
+ non_overlapping_clips=True,
300
+ specific_categories=specific_categories,
301
+ force_clip_type=force_clip_type)
302
+
303
+ # Collect video paths
304
+ gt_videos = []
305
+ for sample in tqdm(val_loader, desc="Collecting samples", total=samples):
306
+ vid_path = os.path.dirname(sample["image_paths"][0][0])
307
+ gt_videos.append(vid_path)
308
+ if len(gt_videos) >= samples:
309
+ break
310
+
311
+ random.shuffle(gt_videos)
312
+
313
+ num_found_samples = len(gt_videos)
314
+ print(f"Found {num_found_samples} ground truth video directories")
315
+
316
+ # Initialize array for all videos
317
+ all_videos = torch.zeros((num_found_samples, num_frames, 3, 320, 512), device="cuda")
318
+
319
+ # Load and process videos
320
+ valid = 0
321
+ for idx, video_path in tqdm(enumerate(gt_videos), desc="Processing videos", total=num_found_samples):
322
+ if valid == num_found_samples:
323
+ break
324
+
325
+ # Get list of jpg files in directory
326
+ frame_files = sorted([f for f in os.listdir(video_path) if f.endswith('.jpg')])
327
+
328
+ if len(frame_files) < num_frames:
329
+ print(f"Skipping {video_path.split('/')[-1]}, insufficient frames: {len(frame_files)}")
330
+ continue
331
+
332
+ # Load frames
333
+ frames = []
334
+ for frame_file in frame_files[0:num_frames:downsample_int]:
335
+ frame_path = os.path.join(video_path, frame_file)
336
+ img = cv2.imread(frame_path)
337
+ img = cv2.resize(img, (512, 320))
338
+ frames.append(img)
339
+
340
+ frames = torch.tensor(np.array(frames), device="cuda")
341
+
342
+ # Process frames
343
+ frames = frames.unsqueeze(0).permute(0, 1, 4, 2, 3)
344
+ all_videos[valid] = frames[:, :num_frames, ::]
345
+ valid += 1
346
+
347
+ if valid == 0:
348
+ raise ValueError("No valid videos found")
349
+
350
+ # Convert to torch tensor and normalize
351
+ all_videos = all_videos.float()
352
+ all_videos.div_(255/2.0).sub_(1.0)
353
+
354
+ # Initialize FVD and compute statistics
355
+ with torch.no_grad():
356
+ fvd = FVD(device='cuda')
357
+ video_real = fvd.preprocess_videos(all_videos)
358
+ feats_real = fvd.detector(video_real, **fvd.detector_kwargs).cpu().numpy()
359
+ mu_real, sigma_real = fvd._compute_stats(feats_real)
360
+
361
+ # Save statistics
362
+ stats = {
363
+ 'mu_real': mu_real,
364
+ 'sigma_real': sigma_real,
365
+ 'num_videos': valid,
366
+ 'num_frames': num_frames,
367
+ 'resolution': 320
368
+ }
369
+ np.savez(save_path, **stats)
370
+ print(f"Saved ground truth statistics to {save_path}")
371
+
372
+ # Clean up
373
+ del fvd, all_videos, video_real, feats_real
374
+ torch.cuda.empty_cache()
375
+
376
+ return save_path
377
+
378
+ def evaluate_vids(vid_root, samples=200, downsample_int=1, num_frames=25, gt_stats=None, shuffle=False):
379
+ """Evaluate FVD score for generated videos using pre-computed ground truth statistics."""
380
+
381
+ # Initialize FVD and load ground truth statistics
382
+ fvd = FVD(device='cuda')
383
+ if gt_stats is not None:
384
+ fvd.load_gt_stats(gt_stats)
385
+
386
+ # Collect generated video paths
387
+ f_gen_vid = []
388
+ gen_videos = os.path.join(vid_root, "gen_videos") if os.path.exists(os.path.join(vid_root, "gen_videos")) else vid_root
389
+ for fname in os.listdir(gen_videos):
390
+ f_gen_vid.append(fname)
391
+
392
+ print(f"Number of generated videos: {len(f_gen_vid)}")
393
+
394
+ if not shuffle:
395
+ f_gen_vid.sort()
396
+ else:
397
+ random.shuffle(f_gen_vid)
398
+
399
+ f_gen_vid = f_gen_vid[:samples]
400
+
401
+ # Initialize array for all videos
402
+ all_gen = np.zeros((samples, num_frames, 3, 320, 512))
403
+
404
+ # Load and process videos
405
+ valid = 0
406
+ for idx, fgen in tqdm(enumerate(f_gen_vid)):
407
+ if valid == samples:
408
+ break
409
+
410
+ gen_vid_path = os.path.join(gen_videos, fgen)
411
+ gen_vid = get_frames_mp4(gen_vid_path, frame_interval=downsample_int)
412
+
413
+ if gen_vid.shape[0] < num_frames:
414
+ print("Skipping, wrong size:", gen_vid.shape[0])
415
+ continue
416
+
417
+ gen_vid = np.expand_dims(gen_vid, 0).transpose(0, 1, 4, 2, 3)
418
+ all_gen[valid] = gen_vid[:, :num_frames, ::]
419
+ valid += 1
420
+
421
+ # Convert to torch tensor and normalize
422
+ all_gen = torch.from_numpy(all_gen).cuda().float()
423
+ all_gen /= 255/2.0
424
+ all_gen -= 1.0
425
+
426
+ # Compute FVD score
427
+ fvd_score = fvd.evaluate(all_gen, video_real=None, use_saved_stats=True)
428
+ del fvd
429
+
430
+ print(f'FVD Score: {fvd_score}')
431
+
432
+ if __name__ == '__main__':
433
+ parser = argparse.ArgumentParser(description='Evaluate FVD score using pre-computed ground truth statistics')
434
+ parser.add_argument('--vid_root', type=str, required=True,
435
+ help='Root directory containing generated videos')
436
+ parser.add_argument('--samples', type=int, default=200,
437
+ help='Number of samples to evaluate (default: 200)')
438
+ parser.add_argument('--num_frames', type=int, default=25,
439
+ help='Number of frames per video (default: 25)')
440
+ parser.add_argument('--downsample_int', type=int, default=1,
441
+ help='Downsample interval for frames (default: 1)')
442
+ parser.add_argument('--gt_stats', type=str, default=None,
443
+ help='Path to ground truth statistics file (optional)')
444
+ parser.add_argument('--shuffle', action='store_true',
445
+ help='Shuffle videos before evaluation')
446
+ parser.add_argument('--collect_stats', action='store_true',
447
+ help='Collect and save ground truth statistics')
448
+ parser.add_argument('--data_root', type=str, required=True,
449
+ help='Root directory for datasets')
450
+ parser.add_argument('--action_type', type=int, default=None,
451
+ help='Action type to filter videos (0: normal, 1-4: crash types)')
452
+ args = parser.parse_args()
453
+
454
+ if args.collect_stats:
455
+ stats_path = collect_fvd_stats(args.data_root, args.samples, args.downsample_int, args.num_frames, args.gt_stats, args.action_type)
456
+ args.gt_stats = stats_path
457
+
458
+ evaluate_vids(args.vid_root, args.samples, args.downsample_int, args.num_frames, args.gt_stats, args.shuffle)
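The intended two-step workflow, sketched in Python with hypothetical paths; note that instantiating FVD downloads the I3D TorchScript detector from the Dropbox URL above:

from src.eval.video_quality_metrics_fvd_gt_rand import collect_fvd_stats, evaluate_vids

# 1) Compute and cache ground-truth I3D statistics once per dataset split / action type
stats_path = collect_fvd_stats("/path/to/Datasets", samples=200, num_frames=25,
                               save_path="/path/to/Datasets/gt_fvd_stats.npz", action_type=None)

# 2) Score any number of generation runs against the cached statistics
evaluate_vids("outputs/eval_run", samples=200, num_frames=25, gt_stats=stats_path)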
src/eval/video_quality_metrics_fvd_pair.py ADDED
@@ -0,0 +1,349 @@
1
+ import numpy as np
2
+ import torch
3
+ import scipy.linalg
4
+ from typing import Tuple
5
+ import torch.nn.functional as F
6
+ import math
7
+ from torchvision import transforms
8
+ import cv2
9
+ import json
10
+ import argparse
11
+ from tqdm import tqdm
12
+ import lpips
13
+ from skimage.metrics import structural_similarity as ssim
14
+ from skimage.metrics import peak_signal_noise_ratio as psnr
15
+
16
+ """
17
+ Copy-pasted from https://github.com/NVlabs/stylegan2-ada-pytorch
18
+ """
19
+ import ctypes
20
+ import fnmatch
21
+ import importlib
22
+ import inspect
23
+ import numpy as np
24
+ import os
25
+ import shutil
26
+ import sys
27
+ import types
28
+ import io
29
+ import pickle
30
+ import re
31
+ import requests
32
+ import html
33
+ import hashlib
34
+ import glob
35
+ import tempfile
36
+ import urllib
37
+ import urllib.request
38
+ import uuid
39
+ from tqdm import tqdm
40
+
41
+ from distutils.util import strtobool
42
+ from typing import Any, List, Tuple, Union, Dict
43
+
44
+
45
+ def open_url(url: str, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False) -> Any:
46
+ """Download the given URL and return a binary-mode file object to access the data."""
47
+ assert num_attempts >= 1
48
+
49
+ # Doesn't look like an URL scheme so interpret it as a local filename.
50
+ if not re.match('^[a-z]+://', url):
51
+ return url if return_filename else open(url, "rb")
52
+
53
+ # Handle file URLs. This code handles unusual file:// patterns that
54
+ # arise on Windows:
55
+ #
56
+ # file:///c:/foo.txt
57
+ #
58
+ # which would translate to a local '/c:/foo.txt' filename that's
59
+ # invalid. Drop the forward slash for such pathnames.
60
+ #
61
+ # If you touch this code path, you should test it on both Linux and
62
+ # Windows.
63
+ #
64
+ # Some internet resources suggest using urllib.request.url2pathname() but
65
+     # that converts forward slashes to backslashes and this causes
66
+ # its own set of problems.
67
+ if url.startswith('file://'):
68
+ filename = urllib.parse.urlparse(url).path
69
+ if re.match(r'^/[a-zA-Z]:', filename):
70
+ filename = filename[1:]
71
+ return filename if return_filename else open(filename, "rb")
72
+
73
+ url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
74
+
75
+ # Download.
76
+ url_name = None
77
+ url_data = None
78
+ with requests.Session() as session:
79
+ if verbose:
80
+ print("Downloading %s ..." % url, end="", flush=True)
81
+ for attempts_left in reversed(range(num_attempts)):
82
+ try:
83
+ with session.get(url) as res:
84
+ res.raise_for_status()
85
+ if len(res.content) == 0:
86
+ raise IOError("No data received")
87
+
88
+ if len(res.content) < 8192:
89
+ content_str = res.content.decode("utf-8")
90
+ if "download_warning" in res.headers.get("Set-Cookie", ""):
91
+ links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
92
+ if len(links) == 1:
93
+ url = requests.compat.urljoin(url, links[0])
94
+ raise IOError("Google Drive virus checker nag")
95
+ if "Google Drive - Quota exceeded" in content_str:
96
+ raise IOError("Google Drive download quota exceeded -- please try again later")
97
+
98
+ match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
99
+ url_name = match[1] if match else url
100
+ url_data = res.content
101
+ if verbose:
102
+ print(" done")
103
+ break
104
+ except KeyboardInterrupt:
105
+ raise
106
+ except:
107
+ if not attempts_left:
108
+ if verbose:
109
+ print(" failed")
110
+ raise
111
+ if verbose:
112
+ print(".", end="", flush=True)
113
+
114
+ # Return data as file object.
115
+ assert not return_filename
116
+ return io.BytesIO(url_data)
117
+
118
+ """
119
+ Modified from https://github.com/cvpr2022-stylegan-v/stylegan-v/blob/main/src/metrics/frechet_video_distance.py
120
+ """
121
+ class FVD:
122
+ def __init__(self, device,
123
+ detector_url='https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt?dl=1',
124
+ rescale=False, resize=False, return_features=True):
125
+
126
+ self.device = device
127
+ self.detector_kwargs = dict(rescale=False, resize=False, return_features=True)
128
+
129
+ with open_url(detector_url, verbose=False) as f:
130
+ self.detector = torch.jit.load(f).eval().to(device)
131
+
132
+ def to_device(self, device):
133
+ self.device = device
134
+ self.detector = self.detector.to(self.device)
135
+
136
+ def _compute_stats(self, feats: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
137
+ mu = feats.mean(axis=0) # [d]
138
+ sigma = np.cov(feats, rowvar=False) # [d, d]
139
+ return mu, sigma
140
+
141
+ def preprocess_videos(self, videos, resolution=224, sequence_length=None):
142
+
143
+ b, t, c, h, w = videos.shape
144
+
145
+ # temporal crop
146
+ if sequence_length is not None:
147
+ assert sequence_length <= t
148
+ videos = videos[:, :sequence_length, ::]
149
+
150
+ # b*t x c x h x w
151
+ videos = videos.reshape(-1, c, h, w)
152
+ if c == 1:
153
+ videos = torch.cat([videos, videos, videos], 1)
154
+ c = 3
155
+
156
+ # scale shorter side to resolution
157
+ scale = resolution / min(h, w)
158
+ # import pdb; pdb.set_trace()
159
+ if h < w:
160
+ target_size = (resolution, math.ceil(w * scale))
161
+ else:
162
+ target_size = (math.ceil(h * scale), resolution)
163
+
164
+ videos = F.interpolate(videos, size=target_size).clamp(min=-1, max=1)
165
+
166
+ # center crop
167
+ _, c, h, w = videos.shape
168
+
169
+ h_start = (h - resolution) // 2
170
+ w_start = (w - resolution) // 2
171
+ videos = videos[:, :, h_start:h_start + resolution, w_start:w_start + resolution]
172
+
173
+ # b, c, t, w, h
174
+ videos = videos.reshape(b, t, c, resolution, resolution).permute(0, 2, 1, 3, 4)
175
+
176
+ return videos.contiguous()
177
+
178
+ @torch.no_grad()
179
+ def evaluate(self, video_fake, video_real, res=224):
180
+
181
+ video_fake = self.preprocess_videos(video_fake,resolution=res)
182
+ video_real = self.preprocess_videos(video_real,resolution=res)
183
+ feats_fake = self.detector(video_fake, **self.detector_kwargs).cpu().numpy()
184
+ feats_real = self.detector(video_real, **self.detector_kwargs).cpu().numpy()
185
+
186
+ mu_gen, sigma_gen = self._compute_stats(feats_fake)
187
+ mu_real, sigma_real = self._compute_stats(feats_real)
188
+
189
+ m = np.square(mu_gen - mu_real).sum()
190
+ s, _ = scipy.linalg.sqrtm(np.dot(sigma_gen, sigma_real), disp=False) # pylint: disable=no-member
191
+ fid = np.real(m + np.trace(sigma_gen + sigma_real - s * 2))
192
+ return fid
193
+
194
+ def evaluate_vids(vid_root, samples=200, downsample=False, num_frames=25):
195
+ """Evaluate video quality metrics between generated and ground truth videos."""
196
+
197
+ # Collect video paths
198
+ vid_name_to_gt_frames = {}
199
+ gt_videos_refs = os.path.join(vid_root, "gt_frames")
200
+ for fname in os.listdir(gt_videos_refs):
201
+         vid_name = fname.removeprefix("gt_frames_").split(".")[0]
202
+ vid_name_to_gt_frames[vid_name] = fname
203
+
204
+ f_gen_vid = []
205
+ gen_videos = os.path.join(vid_root, "gen_videos")
206
+ for fname in os.listdir(gen_videos):
207
+ f_gen_vid.append(fname)
208
+         vid_name = fname.removeprefix("genvid_").split(".")[0]
209
+ assert vid_name_to_gt_frames.get(vid_name) is not None, f"{fname} has no matching gt frames"
210
+
211
+ print(f"Number of generated videos: {len(f_gen_vid)}")
212
+
213
+ # Initialize arrays for all videos
214
+ all_gt = np.zeros((samples, num_frames, 3, 320, 512))
215
+ all_gen = np.zeros((samples, num_frames, 3, 320, 512))
216
+
217
+ # Load and process videos
218
+ valid = 0
219
+ for idx, fgen in tqdm(enumerate(f_gen_vid), desc="Collecting video frames"):
220
+ if valid == samples:
221
+ break
222
+
223
+         vid_name = fgen.removeprefix("genvid_").split(".")[0]
224
+ fgt = vid_name_to_gt_frames[vid_name]
225
+
226
+ gen_vid_path = os.path.join(gen_videos, fgen)
227
+ gen_vid = get_frames_mp4(gen_vid_path)
228
+
229
+ with open(os.path.join(gt_videos_refs, fgt)) as gt_json:
230
+ gt_vid = get_frames_from_path_list(json.load(gt_json))
231
+
232
+ if gt_vid.shape[0] < num_frames or gen_vid.shape[0] < num_frames:
233
+ print("Skipping, wrong size:", gt_vid.shape[0], gen_vid.shape[0])
234
+ continue
235
+
236
+ gt_vid = np.expand_dims(gt_vid, 0).transpose(0, 1, 4, 2, 3)
237
+ gen_vid = np.expand_dims(gen_vid, 0).transpose(0, 1, 4, 2, 3)
238
+
239
+ all_gt[valid] = gt_vid[:, :num_frames, ::]
240
+ all_gen[valid] = gen_vid[:, :num_frames, ::]
241
+ valid += 1
242
+
243
+ # Convert to torch tensors and normalize
244
+ all_gt = torch.from_numpy(all_gt).cuda().float()
245
+ all_gt /= 255/2.0
246
+ all_gt -= 1.0
247
+ all_gen = torch.from_numpy(all_gen).cuda().float()
248
+ all_gen /= 255/2.0
249
+ all_gen -= 1.0
250
+
251
+ # Compute FVD score
252
+ fvd = FVD(device='cuda')
253
+ fvd_score = fvd.evaluate(all_gt, all_gen)
254
+ del fvd
255
+
256
+ # Compute LPIPS score
257
+ loss_fn_alex = lpips.LPIPS(net='alex').cuda()
258
+ lpips_score = 0
259
+ for idx in range(all_gen.shape[0]):
260
+ lpips_score += loss_fn_alex(all_gt[idx], all_gen[idx])/all_gen.shape[0]
261
+ lpips_score = lpips_score.mean().item()
262
+ del loss_fn_alex
263
+
264
+ # Compute SSIM and PSNR scores
265
+ all_gen = all_gen.detach().cpu().numpy()
266
+ all_gt = all_gt.detach().cpu().numpy()
267
+
268
+ ssim_score_vid = np.zeros(samples)
269
+ ssim_score_image = np.zeros((samples, num_frames))
270
+ psnr_score_vid = np.zeros(samples)
271
+ psnr_score_image = np.zeros((samples, num_frames))
272
+ psnr_score_all = psnr(all_gt, all_gen)
273
+
274
+ for vid_idx in tqdm(range(all_gen.shape[0]), desc="Computing SSIM and PSNR"):
275
+ for f_idx in range(all_gen.shape[1]):
276
+ img_gt = all_gt[vid_idx, f_idx]
277
+ img_gen = all_gen[vid_idx, f_idx]
278
+ data_range = max(img_gt.max(), img_gen.max()) - min(img_gt.min(), img_gen.min())
279
+ ssim_score_image[vid_idx, f_idx] = ssim(img_gt, img_gen, channel_axis=0, data_range=data_range, gaussian_weights=True, sigma=1.5)
280
+ psnr_score_image[vid_idx, f_idx] = psnr(img_gt, img_gen, data_range=data_range)
281
+
282
+ vid_gt = all_gt[vid_idx]
283
+ vid_gen = all_gen[vid_idx]
284
+ data_range = max(vid_gt.max(), vid_gen.max()) - min(vid_gt.min(), vid_gen.min())
285
+ ssim_score_vid[vid_idx] = ssim(vid_gt, vid_gen, channel_axis=1, data_range=data_range, gaussian_weights=True, sigma=1.5)
286
+ psnr_score_vid[vid_idx] = psnr(vid_gt, vid_gen, data_range=data_range)
287
+
288
+ ssim_score_image_error = np.sqrt(((ssim_score_image - ssim_score_image.mean())**2).sum()/200)
289
+ psnr_score_image_error = np.sqrt(((psnr_score_image - psnr_score_image.mean())**2).sum()/200)
290
+
291
+ # Print results
292
+ print(f'FVD Score: {fvd_score}')
293
+ print(f'LPIPS Score: {lpips_score}')
294
+ print(f'SSIM Score (per image): {ssim_score_image.mean()}')
295
+ print(f'SSIM Score Error: {ssim_score_image_error}')
296
+ print(f'PSNR Score (per image): {psnr_score_image.mean()}')
297
+ print(f'PSNR Score Error: {psnr_score_image_error}')
298
+
299
+ # Print copy-friendly format
300
+ print("\nCopy friendly format:")
301
+ print(f"{fvd_score}, {lpips_score}, {ssim_score_image.mean()}, {psnr_score_image.mean()}")
302
+
303
+ def get_frames_from_path_list(path_list):
304
+     frames = []
305
+     for path in path_list:
306
+         img = cv2.imread(path)
307
+         img = cv2.resize(img, [512, 320])
308
+         frames.append(img)
309
+     return np.array(frames)
310
+
311
+ def get_frames_mp4(video_path: str, frame_interval: int = 1) -> np.ndarray:
312
+
313
+     # Open the video file
314
+     cap = cv2.VideoCapture(video_path)
315
+     if not cap.isOpened():
316
+         raise ValueError(f"Could not open video file: {video_path}")
317
+
318
+     frame_count = 0
319
+     saved_count = 0
320
+
321
+     frames = []
322
+     while True:
323
+         ret, frame = cap.read()
324
+         if not ret:
325
+             break
326
+
327
+         # Save frame if it's the right interval
328
+         if frame_count % frame_interval == 0:
329
+             frames.append(frame)
330
+             saved_count += 1
331
+
332
+         frame_count += 1
333
+
334
+     cap.release()
335
+     return np.array(frames)
336
+
337
+ if __name__ == '__main__':
338
+     parser = argparse.ArgumentParser(description='Evaluate video quality metrics between generated and ground truth videos')
339
+     parser.add_argument('--vid_root', type=str, required=True,
340
+                         help='Root directory containing generated and ground truth videos')
341
+     parser.add_argument('--samples', type=int, default=200,
342
+                         help='Number of samples to evaluate (default: 200)')
343
+     parser.add_argument('--num_frames', type=int, default=25,
344
+                         help='Number of frames per video (default: 25)')
345
+     parser.add_argument('--downsample', action='store_true',
346
+                         help='Downsample videos during evaluation')
347
+     args = parser.parse_args()
348
+
349
+     evaluate_vids(args.vid_root, args.samples, args.downsample, args.num_frames)
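The file-name pairing this script relies on, made explicit with prefix removal (str.removeprefix needs Python 3.9+); the root path is illustrative:

import os

vid_root = "outputs/eval_run"                     # hypothetical output root
gen_dir = os.path.join(vid_root, "gen_videos")    # genvid_<counter>_<vid_name>.mp4
gt_dir = os.path.join(vid_root, "gt_frames")      # gt_frames_<counter>_<vid_name>.json

pairs = []
for fname in sorted(os.listdir(gen_dir)):
    if not fname.endswith(".mp4"):
        continue
    vid_name = fname.removeprefix("genvid_").rsplit(".", 1)[0]
    gt_json = os.path.join(gt_dir, f"gt_frames_{vid_name}.json")
    if os.path.exists(gt_json):
        pairs.append((os.path.join(gen_dir, fname), gt_json))

print(f"Matched {len(pairs)} generated clips to ground-truth frame lists")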
src/eval/video_quality_metrics_jedi_gt_rand.py ADDED
@@ -0,0 +1,91 @@
1
+ import numpy as np
2
+ import torch
3
+ import cv2
4
+ import json
5
+ import random
6
+ import os
7
+ import argparse
8
+ from tqdm import tqdm
9
+ from torch.utils.data import DataLoader
10
+
11
+ from videojedi import JEDiMetric
12
+ from .video_dataset import VideoDataset
13
+ from src.datasets.dataset_utils import get_dataloader
14
+
15
+ def custom_collate(batch):
16
+ """Custom collate function for DataLoader to handle video clips."""
17
+ videos, targets = [], []
18
+ for sample in batch:
19
+ clips = sample["clips"]
20
+ videos.append(clips)
21
+ return torch.utils.data.dataloader.default_collate(videos), targets
22
+
23
+ def evaluate_vids(vid_root, samples=200, downsample_int=1, num_frames=25, gt_samples=500, test_feature_path=None, action_type=None, shuffle=False):
24
+ """Evaluate JEDi metric between generated and ground truth videos."""
25
+
26
+ # Initialize JEDi metric
27
+ jedi = JEDiMetric(feature_path=vid_root,
28
+ test_feature_path=test_feature_path,
29
+ model_dir="/path/to/Models")
30
+
31
+ # Create dataset and dataloader for generated videos
32
+ gen_dataset = VideoDataset(vid_root, num_frames=num_frames, downsample_int=downsample_int)
33
+ gen_loader = DataLoader(gen_dataset, batch_size=1, shuffle=shuffle, num_workers=4)
34
+
35
+ # Set up category filtering if specified
36
+ specific_categories = None
37
+ force_clip_type = None
38
+ if action_type is not None:
39
+ if action_type == 0:
40
+ force_clip_type = "normal"
41
+ print("Collecting normal samples only")
42
+ else:
43
+ classes_by_action_type = {
44
+ 1: [61, 62, 13, 14, 15, 16, 17, 18],
45
+ 2: list(range(1, 12 + 1)),
46
+ 3: [37, 39, 41, 42, 44] + list(range(19, 36 + 1)) + list(range(52, 60 + 1)),
47
+ 4: [38, 40, 43, 45, 46, 47, 48, 49, 50, 51]
48
+ }
49
+ specific_categories = classes_by_action_type[action_type]
50
+ force_clip_type = "crash"
51
+ print("Collecting crash samples from categories:", specific_categories)
52
+
53
+ # Create dataset and dataloader for ground truth videos
54
+ dataset_name = "mmau"
55
+ train_set = True
56
+ val_dataset, _ = get_dataloader("path/to/Datasets", dataset_name,
57
+ if_train=train_set, clip_length=num_frames,
58
+ batch_size=1, num_workers=0, shuffle=True,
59
+ image_height=320, image_width=512,
60
+ non_overlapping_clips=True,
61
+ specific_categories=specific_categories,
62
+ force_clip_type=force_clip_type)
63
+ val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate)
64
+
65
+ # Compute JEDi metric
66
+ jedi.load_features(train_loader=gen_loader, test_loader=val_loader,
67
+ num_samples=samples, num_test_samples=gt_samples)
68
+ jedi_metric = jedi.compute_metric()
69
+ print(f"JEDi Metric: {jedi_metric}")
70
+
71
+ if __name__ == '__main__':
72
+ parser = argparse.ArgumentParser(description='Evaluate JEDi metric between generated and ground truth videos')
73
+ parser.add_argument('--vid_root', type=str, required=True,
74
+ help='Root directory containing generated videos')
75
+ parser.add_argument('--samples', type=int, default=200,
76
+ help='Number of samples to evaluate (default: 200)')
77
+ parser.add_argument('--gt_samples', type=int, default=500,
78
+ help='Number of ground truth samples to use (default: 500)')
79
+ parser.add_argument('--num_frames', type=int, default=25,
80
+ help='Number of frames per video (default: 25)')
81
+ parser.add_argument('--downsample_int', type=int, default=1,
82
+ help='Downsample interval for frames (default: 1)')
83
+ parser.add_argument('--test_feature_path', type=str, default=None,
84
+ help='Path to test features (optional)')
85
+ parser.add_argument('--action_type', type=int, default=None,
86
+ help='Action type to filter videos (0: normal, 1-4: crash types)')
87
+ parser.add_argument('--shuffle', action='store_true',
88
+ help='Shuffle videos before evaluation')
89
+ args = parser.parse_args()
90
+
91
+ evaluate_vids(args.vid_root, args.samples, args.downsample_int, args.num_frames, args.gt_samples, args.test_feature_path, args.action_type, args.shuffle)
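A quick check of what custom_collate is expected to produce, assuming each dataset sample is a dict whose "clips" entry is a (T, C, H, W) tensor (as in the MMAU dataset classes) and that videojedi is installed:

import torch

from src.eval.video_quality_metrics_jedi_gt_rand import custom_collate

fake_batch = [{"clips": torch.zeros(25, 3, 320, 512)} for _ in range(2)]
videos, targets = custom_collate(fake_batch)
print(videos.shape)   # torch.Size([2, 25, 3, 320, 512])
print(targets)        # [] -- JEDi only consumes the video tensors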
src/eval/video_quality_metrics_jedi_pair.py ADDED
@@ -0,0 +1,92 @@
1
+ import torch
2
+ import os
3
+ import argparse
4
+ from torch.utils.data import DataLoader
5
+
6
+ from videojedi import JEDiMetric
7
+ from .video_dataset import VideoDataset
8
+ from src.datasets.dataset_utils import get_dataloader
9
+
10
+ def custom_collate(batch):
11
+ videos, targets = [], []
12
+ for sample in batch:
13
+ clips = sample["clips"]
14
+ videos.append(clips)
15
+ return torch.utils.data.dataloader.default_collate(videos), targets
16
+
17
+ def evaluate_vids(vid_root, samples=200, downsample_int=1, num_frames=25, gt_samples=500, test_feature_path=None, action_type=None):
18
+ """Evaluate JEDi metric between generated and ground truth videos."""
19
+
20
+ # Initialize JEDi metric
21
+ jedi = JEDiMetric(feature_path=vid_root,
22
+ test_feature_path=test_feature_path,
23
+ model_dir="/path/to/Models")
24
+
25
+ # Create dataset and dataloader for generated videos
26
+ gen_dataset = VideoDataset(vid_root, num_frames=num_frames, downsample_int=downsample_int)
27
+ gen_loader = DataLoader(gen_dataset, batch_size=1, shuffle=False, num_workers=4)
28
+
29
+ # Set up category filtering if specified
30
+ specific_categories = None
31
+ force_clip_type = None
32
+ if action_type is not None:
33
+ if action_type == 0:
34
+ force_clip_type = "normal"
35
+ print("Collecting normal samples only")
36
+ else:
37
+ classes_by_action_type = {
38
+ 1: [61, 62, 13, 14, 15, 16, 17, 18],
39
+ 2: list(range(1, 12 + 1)),
40
+ 3: [37, 39, 41, 42, 44] + list(range(19, 36 + 1)) + list(range(52, 60 + 1)),
41
+ 4: [38, 40, 43, 45, 46, 47, 48, 49, 50, 51]
42
+ }
43
+ specific_categories = classes_by_action_type[action_type]
44
+ force_clip_type = "crash"
45
+ print("Collecting crash samples from categories:", specific_categories)
46
+
47
+ # Get specific samples to evaluate
48
+ specific_samples = []
49
+ gen_videos = os.path.join(vid_root, "gen_videos") if os.path.exists(f"{vid_root}/gen_videos") else vid_root
50
+ for fname in os.listdir(gen_videos):
51
+         vid_name = fname.removeprefix("genvid_").split(".")[0]
52
+ gt_vid_name = "_".join(vid_name.split("_")[1:])
53
+ specific_samples.append(gt_vid_name)
54
+
55
+ # Create dataset and dataloader for ground truth videos
56
+ dataset_name = "mmau"
57
+ train_set = False
58
+ val_dataset, _ = get_dataloader("/path/to/Datasets", dataset_name,
59
+ if_train=train_set, clip_length=num_frames,
60
+ batch_size=1, num_workers=0, shuffle=True,
61
+ image_height=320, image_width=512,
62
+ non_overlapping_clips=True,
63
+ specific_categories=specific_categories,
64
+ force_clip_type=force_clip_type,
65
+ specific_samples=specific_samples)
66
+ val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate)
67
+
68
+ # Compute JEDi metric
69
+ jedi.load_features(train_loader=gen_loader, test_loader=val_loader,
70
+ num_samples=samples, num_test_samples=gt_samples)
71
+ jedi_metric = jedi.compute_metric()
72
+ print(f"JEDi Metric: {jedi_metric}")
73
+
74
+ if __name__ == '__main__':
75
+ parser = argparse.ArgumentParser(description='Evaluate JEDi metric between generated and ground truth videos')
76
+ parser.add_argument('--vid_root', type=str, required=True,
77
+ help='Root directory containing generated videos')
78
+ parser.add_argument('--samples', type=int, default=200,
79
+ help='Number of samples to evaluate (default: 200)')
80
+ parser.add_argument('--gt_samples', type=int, default=500,
81
+ help='Number of ground truth samples to use (default: 500)')
82
+ parser.add_argument('--num_frames', type=int, default=25,
83
+ help='Number of frames per video (default: 25)')
84
+ parser.add_argument('--downsample_int', type=int, default=1,
85
+ help='Downsample interval for frames (default: 1)')
86
+ parser.add_argument('--test_feature_path', type=str, default=None,
87
+ help='Path to test features (optional)')
88
+ parser.add_argument('--action_type', type=int, default=None,
89
+ help='Action type to filter videos (0: normal, 1-4: crash types)')
90
+ args = parser.parse_args()
91
+
92
+ evaluate_vids(args.vid_root, args.samples, args.downsample_int, args.num_frames, args.gt_samples, args.test_feature_path, args.action_type)
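The action-type filtering shared by this script and the FVD/JEDi scripts above, factored into a small helper for clarity; the function name is ours, but the category ids mirror the mapping hard-coded in those scripts:

def action_type_to_filter(action_type):
    """Map an action-type id (0 = normal, 1-4 = crash types) to dataset filter arguments."""
    if action_type is None:
        return None, None                     # no filtering
    if action_type == 0:
        return None, "normal"
    classes_by_action_type = {
        1: [61, 62, 13, 14, 15, 16, 17, 18],
        2: list(range(1, 13)),
        3: [37, 39, 41, 42, 44] + list(range(19, 37)) + list(range(52, 61)),
        4: [38, 40, 43, 45, 46, 47, 48, 49, 50, 51],
    }
    return classes_by_action_type[action_type], "crash"

specific_categories, force_clip_type = action_type_to_filter(3)
print(force_clip_type, specific_categories[:5])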
src/models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from src.models.controlnet import ControlNetModel
2
+ from src.models.unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
src/models/controlnet.py ADDED
@@ -0,0 +1,391 @@
1
+ from typing import Any, Dict, List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
8
+ from diffusers.models.modeling_utils import ModelMixin
9
+ from diffusers.models.unets.unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block
10
+ from diffusers.loaders import FromOriginalControlNetMixin
11
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
12
+ from diffusers.utils import logging
13
+ from diffusers.models import ControlNetModel as ControlNetModel_original
14
+ from diffusers.models.controlnet import ControlNetOutput, zero_module
15
+
16
+ from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
17
+
18
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
19
+
20
+ class ControlNetModel(ControlNetModel_original): # (ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
21
+
22
+ r"""
23
+ A controlnet for conditional Spatio-Temporal UNet model.
24
+
25
+ Parameters:
26
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
27
+ Height and width of input/output sample.
28
+ in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
29
+ out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
30
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
31
+ The tuple of downsample blocks to use.
32
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
33
+ The tuple of output channels for each block.
34
+ addition_time_embed_dim: (`int`, defaults to 256):
35
+             Dimension used to encode the additional time ids.
36
+ projection_class_embeddings_input_dim (`int`, defaults to 768):
37
+ The dimension of the projection of encoded `added_time_ids`.
38
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
39
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
40
+ The dimension of the cross attention features.
41
+ transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
42
+ The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
43
+ [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
44
+ [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
45
+ num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
46
+ The number of attention heads.
47
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
48
+ action_dim: (`int`, defaults to 256):
49
+ Dimension of the action features.
50
+ """
51
+
52
+ _supports_gradient_checkpointing = True
53
+
54
+ @register_to_config
55
+ def __init__(
56
+ self,
57
+ sample_size: Optional[int] = None,
58
+ in_channels: int = 8,
59
+ down_block_types: Tuple[str] = (
60
+ "CrossAttnDownBlockSpatioTemporal",
61
+ "CrossAttnDownBlockSpatioTemporal",
62
+ "CrossAttnDownBlockSpatioTemporal",
63
+ "DownBlockSpatioTemporal",
64
+ ),
65
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
66
+ addition_time_embed_dim: int = 256,
67
+ projection_class_embeddings_input_dim: int = 768,
68
+ layers_per_block: Union[int, Tuple[int]] = 2,
69
+ cross_attention_dim: Union[int, Tuple[int]] = 1024,
70
+ transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
71
+ num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20),
72
+ num_frames: int = 25,
73
+ action_dim: int = 5, # Dimension of the action features
74
+ bbox_embedding_shape: Tuple[int] = (4, 128, 128),
75
+ ):
76
+ # calling the super class constructors without calling ControlNetModel_original's
77
+ ModelMixin.__init__(self)
78
+ ConfigMixin.__init__(self)
79
+ FromOriginalControlNetMixin.__init__(self)
80
+
81
+ self.sample_size = sample_size
82
+
83
+ # Check inputs
84
+ if len(block_out_channels) != len(down_block_types):
85
+ raise ValueError(
86
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
87
+ )
88
+
89
+ if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
90
+ raise ValueError(
91
+ f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
92
+ )
93
+
94
+ if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
95
+ raise ValueError(
96
+ f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
97
+ )
98
+
99
+ if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
100
+ raise ValueError(
101
+ f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
102
+ )
103
+
104
+ # input
105
+ self.conv_in = nn.Conv2d(
106
+ in_channels,
107
+ block_out_channels[0],
108
+ kernel_size=3,
109
+ padding=1,
110
+ )
111
+
112
+ # time
113
+ time_embed_dim = block_out_channels[0] * 4
114
+
115
+ self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
116
+ timestep_input_dim = block_out_channels[0]
117
+
118
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
119
+
120
+ self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
121
+ self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
122
+
123
+ # Action projection layer
124
+ hidden_action_dim = 256
125
+ self.action_embedding = nn.Embedding(action_dim, hidden_action_dim)
126
+ self.action_proj = nn.Linear(hidden_action_dim, cross_attention_dim)
127
+
128
+ # Learnable null embedding for bbox masking
129
+ self.bbox_null_embedding = nn.Parameter(torch.randn(bbox_embedding_shape))
130
+
131
+ self.down_blocks = nn.ModuleList([])
132
+ self.controlnet_down_blocks = nn.ModuleList([])
133
+
134
+ if isinstance(num_attention_heads, int):
135
+ num_attention_heads = (num_attention_heads,) * len(down_block_types)
136
+
137
+ if isinstance(cross_attention_dim, int):
138
+ cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
139
+
140
+ if isinstance(layers_per_block, int):
141
+ layers_per_block = [layers_per_block] * len(down_block_types)
142
+
143
+ if isinstance(transformer_layers_per_block, int):
144
+ transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
145
+
146
+ blocks_time_embed_dim = time_embed_dim
147
+
148
+ self.control_conv_in = nn.Conv2d(
149
+ in_channels//2,
150
+ block_out_channels[0],
151
+ kernel_size=3,
152
+ padding=1,
153
+ )
154
+ # # Initialize the re-zero parameter
155
+ # self.rz_weight = nn.Parameter(torch.Tensor([0]))
156
+
157
+ # down
158
+ output_channel = block_out_channels[0]
159
+
160
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
161
+ controlnet_block = zero_module(controlnet_block)
162
+ self.controlnet_down_blocks.append(controlnet_block)
163
+
164
+ for i, down_block_type in enumerate(down_block_types):
165
+ input_channel = output_channel
166
+ output_channel = block_out_channels[i]
167
+ is_final_block = i == len(block_out_channels) - 1
168
+
169
+ down_block = get_down_block(
170
+ down_block_type,
171
+ num_layers=layers_per_block[i],
172
+ transformer_layers_per_block=transformer_layers_per_block[i],
173
+ in_channels=input_channel,
174
+ out_channels=output_channel,
175
+ temb_channels=blocks_time_embed_dim,
176
+ add_downsample=not is_final_block,
177
+ resnet_eps=1e-5,
178
+ cross_attention_dim=cross_attention_dim[i],
179
+ num_attention_heads=num_attention_heads[i],
180
+ resnet_act_fn="silu",
181
+ )
182
+ self.down_blocks.append(down_block)
183
+
184
+ for _ in range(layers_per_block[i]):
185
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
186
+ controlnet_block = zero_module(controlnet_block)
187
+ self.controlnet_down_blocks.append(controlnet_block)
188
+
189
+ if not is_final_block:
190
+ controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
191
+ controlnet_block = zero_module(controlnet_block)
192
+ self.controlnet_down_blocks.append(controlnet_block)
193
+
194
+ # mid
195
+ controlnet_block = nn.Conv2d(block_out_channels[-1], block_out_channels[-1], kernel_size=1)
196
+ controlnet_block = zero_module(controlnet_block)
197
+ self.controlnet_mid_block = controlnet_block
198
+ self.mid_block = UNetMidBlockSpatioTemporal(
199
+ block_out_channels[-1],
200
+ temb_channels=blocks_time_embed_dim,
201
+ transformer_layers_per_block=transformer_layers_per_block[-1],
202
+ cross_attention_dim=cross_attention_dim[-1],
203
+ num_attention_heads=num_attention_heads[-1],
204
+ )
205
+
206
+ # count how many layers upsample the images
207
+ self.num_upsamplers = 0
208
+
209
+ @classmethod
210
+ def from_unet(cls,
211
+ unet: UNetSpatioTemporalConditionModel,
212
+ load_weights_from_unet: bool = True,
213
+ action_dim: int = 5,
214
+ bbox_embedding_shape: Tuple[int] = (4, 128, 128)):
215
+
216
+ ctrlnet = cls(
217
+ in_channels=unet.config.in_channels,
218
+ down_block_types=unet.config.down_block_types,
219
+ block_out_channels=unet.config.block_out_channels,
220
+ addition_time_embed_dim=unet.config.addition_time_embed_dim,
221
+ projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
222
+ layers_per_block=unet.config.layers_per_block,
223
+ cross_attention_dim=unet.config.cross_attention_dim,
224
+ transformer_layers_per_block=unet.config.transformer_layers_per_block,
225
+ num_attention_heads=unet.config.num_attention_heads,
226
+ num_frames=unet.config.num_frames,
227
+ action_dim=action_dim,
228
+ bbox_embedding_shape=bbox_embedding_shape,
229
+ )
230
+ unet_keys = set(unet.state_dict().keys())
231
+ ctrl_keys = set(ctrlnet.state_dict().keys())
232
+ intersection_keys = ctrl_keys.intersection(unet_keys)
233
+ for key in ctrl_keys:
234
+ if key in intersection_keys:
235
+ if load_weights_from_unet:
236
+ ctrlnet.state_dict()[key].copy_(unet.state_dict()[key])
237
+ # else:
238
+ # logger.warning(f"Key {key} not found in UNet model, initializing it randomly.")
239
+
240
+ return ctrlnet
241
+
242
+ def forward(
243
+ self,
244
+ sample: torch.FloatTensor,
245
+ timestep: Union[torch.Tensor, float, int],
246
+ encoder_hidden_states: torch.Tensor,
247
+ added_time_ids: torch.Tensor,
248
+ control_cond: torch.FloatTensor = None,
249
+ action_type: torch.LongTensor = None,
250
+ conditioning_scale: float = 1.0,
251
+ return_dict: bool = True,
252
+ ) -> Union[ControlNetOutput, Tuple]:
253
+ r"""
254
+ This approach effectively integrates the forward method of the UNetSpatioTemporalConditionModel with the forward
255
+ method of the ControlNetModel
256
+
257
+ Args:
258
+ sample (`torch.FloatTensor`):
259
+ The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
260
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
261
+ encoder_hidden_states (`torch.FloatTensor`):
262
+ The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
263
+ added_time_ids: (`torch.FloatTensor`):
264
+ The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
265
+ embeddings and added to the time embeddings.
266
+             control_cond (`torch.FloatTensor`):
267
+ The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
268
+ action_type (`torch.LongTensor`):
269
+ The action type with shape `(batch_size)`.
270
+ conditioning_scale (`float`, defaults to `1.0`):
271
+ The scale factor for ControlNet outputs.
272
+ return_dict (`bool`, *optional*, defaults to `True`):
273
+                 Whether or not to return a [`~models.unet_spatio_temporal.UNetSpatioTemporalConditionOutput`] instead
274
+ of a plain tuple.
275
+ Returns:
276
+ [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
277
+ If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
278
+ returned where the first element is the sample tensor.
279
+ """
280
+ # 1. time
281
+ timesteps = timestep
282
+ if len(timesteps.shape) == 0:
283
+ timesteps = timesteps[None].to(sample.device)
284
+
285
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
286
+ batch_size, num_frames = sample.shape[:2]
287
+ timesteps = timesteps.expand(batch_size)
288
+
289
+ t_emb = self.time_proj(timesteps)
290
+
291
+ # `Timesteps` does not contain any weights and will always return f32 tensors
292
+ # but time_embedding might actually be running in fp16. so we need to cast here.
293
+ # there might be better ways to encapsulate this.
294
+ t_emb = t_emb.to(dtype=sample.dtype)
295
+
296
+ emb = self.time_embedding(t_emb)
297
+
298
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
299
+ time_embeds = time_embeds.reshape((batch_size, -1))
300
+ time_embeds = time_embeds.to(emb.dtype)
301
+ aug_emb = self.add_embedding(time_embeds)
302
+ emb = emb + aug_emb
303
+
304
+ # Flatten the batch and frames dimensions
305
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
306
+ sample = sample.flatten(0, 1)
307
+ # control_cond: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
308
+ control_cond = control_cond.flatten(0, 1)
309
+ # Repeat the embeddings num_video_frames times
310
+ # emb: [batch, channels] -> [batch * frames, channels]
311
+ emb = emb.repeat_interleave(num_frames, dim=0)
312
+ # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
313
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
314
+
315
+ # Process action features if provided
316
+ if action_type is not None:
317
+
318
+ # Embed action features
319
+ action_type = action_type.to(encoder_hidden_states.device, dtype=torch.long)
320
+
321
+ # Flatten action features to match the batch*frames dimension
322
+ # action_features: [batch, action_dim] -> [batch * frames, action_dim]
323
+ if action_type.dim() == 1:
324
+ action_type = action_type.unsqueeze(0)
325
+ action_type = action_type.repeat_interleave(num_frames, dim=0)
326
+
327
+ # Project action features to match the embedding dimension
328
+ action_features = self.action_embedding(action_type)
329
+ action_emb = self.action_proj(action_features)
330
+
331
+ # Add action embeddings to the encoder_hidden_states
332
+ # Make sure not to add action embeddings to masked hidden states
333
+ is_masked_cond = (encoder_hidden_states == 0).all(dim=2).unsqueeze(-1)
334
+ encoder_hidden_states = torch.where(is_masked_cond, encoder_hidden_states, encoder_hidden_states + action_emb)
335
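+ # `is_masked_cond` flags encoder_hidden_states rows that are entirely zero, i.e. image embeddings
+ # that were dropped for classifier-free guidance; the torch.where above leaves those rows untouched
+ # so the action embedding never leaks into the unconditional branch.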
+
336
+ # 2. pre-process
337
+ sample = self.conv_in(sample)
338
+ control_cond = self.control_conv_in(control_cond)
339
+ sample = sample + control_cond  # * self.rz_weight
340
+
341
+ image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
342
+
343
+ down_block_res_samples = (sample,)
344
+ for downsample_block in self.down_blocks:
345
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
346
+ sample, res_samples = downsample_block(
347
+ hidden_states=sample,
348
+ temb=emb,
349
+ encoder_hidden_states=encoder_hidden_states,
350
+ image_only_indicator=image_only_indicator,
351
+ )
352
+ else:
353
+ sample, res_samples = downsample_block(
354
+ hidden_states=sample,
355
+ temb=emb,
356
+ image_only_indicator=image_only_indicator,
357
+ )
358
+
359
+ down_block_res_samples += res_samples
360
+
361
+ # 4. mid
362
+ sample = self.mid_block(
363
+ hidden_states=sample,
364
+ temb=emb,
365
+ encoder_hidden_states=encoder_hidden_states,
366
+ image_only_indicator=image_only_indicator,
367
+ )
368
+
369
+ # 5. Control net blocks
370
+
371
+ controlnet_down_block_res_samples = ()
372
+
373
+ for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
374
+ down_block_res_sample = controlnet_block(down_block_res_sample)
375
+ controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
376
+
377
+ down_block_res_samples = controlnet_down_block_res_samples
378
+
379
+ mid_block_res_sample = self.controlnet_mid_block(sample)
380
+
381
+ # 6. scaling
382
+
383
+ down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
384
+ mid_block_res_sample = mid_block_res_sample * conditioning_scale
385
+
386
+ if not return_dict:
387
+ return (down_block_res_samples, mid_block_res_sample)
388
+
389
+ return ControlNetOutput(
390
+ down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
391
+ )
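For readers skimming the diff, the intended wiring between the ControlNet above and the spatio-temporal UNet in the next file looks roughly like the sketch below. It is an illustrative shape-level sketch, not part of the upload: the checkpoint path, the `from_unet`-style constructor name, and the tensor sizes are assumptions, and in practice the latents and conditioning come from the VAE, CLIP encoder and scheduler as in the pipelines further down.

```python
import torch
from src.models import UNetSpatioTemporalConditionModel, ControlNetModel

# Hypothetical checkpoint; any SVD-style UNet with a matching config would do.
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", subfolder="unet"
)
controlnet = ControlNetModel.from_unet(unet, load_weights_from_unet=True)  # constructor name assumed

b, f = 1, 25
sample = torch.randn(b, f, 8, 40, 64)        # noisy latents concatenated with first-frame latents
control_cond = torch.randn(b, f, 4, 40, 64)  # VAE latents of the rendered bbox frames (size assumed)
image_embeddings = torch.randn(b, 1, 1024)   # CLIP image embedding of the first frame
added_time_ids = torch.tensor([[6.0, 127.0, 0.02]])
t = torch.tensor(500.0)

# ControlNet produces residuals for every down block plus the mid block ...
down_res, mid_res = controlnet(
    sample, t, image_embeddings, added_time_ids,
    control_cond=control_cond, conditioning_scale=1.0, return_dict=False,
)

# ... which the UNet adds to its own skip connections before the up blocks.
noise_pred = unet(
    sample, t, image_embeddings, added_time_ids,
    down_block_additional_residuals=down_res,
    mid_block_additional_residuals=mid_res,
).sample
```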
src/models/unet_spatio_temporal_condition.py ADDED
@@ -0,0 +1,169 @@
1
+
2
+ from diffusers.loaders import PeftAdapterMixin
3
+ from diffusers import ModelMixin
4
+
5
+ from diffusers import UNetSpatioTemporalConditionModel as UNetSpatioTemporalConditionModel_orig
6
+ import torch
7
+ from einops import rearrange
8
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from diffusers.models.unets.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput
11
+
12
+ # NOTE: ModelMixin is added only to keep `from_pretrained` compatible with some older versions of diffusers
13
+ class UNetSpatioTemporalConditionModel(UNetSpatioTemporalConditionModel_orig, PeftAdapterMixin, ModelMixin):
14
+
15
+ def enable_grad(self, temporal_transformer_block=True, all=False):
+ # Collect the parameters to train: every parameter when `all` is set (note: shadows the builtin),
+ # otherwise only the temporal transformer blocks when `temporal_transformer_block` is True.
+ parameters_list = []
+ for name, param in self.named_parameters():
+ if all or ('temporal_transformer_block' in name and temporal_transformer_block):
+ parameters_list.append(param)
+ param.requires_grad = True
+ else:
+ param.requires_grad = False
+ return parameters_list
24
+
25
+ def get_parameters_with_grad(self):
26
+ return [param for param in self.parameters() if param.requires_grad]
27
+
28
+ def forward(
29
+ self,
30
+ sample: torch.FloatTensor,
31
+ timestep: Union[torch.Tensor, float, int],
32
+ encoder_hidden_states: torch.Tensor,
33
+ added_time_ids: torch.Tensor,
34
+ down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
35
+ mid_block_additional_residuals: Optional[torch.Tensor] = None,
36
+ return_dict: bool = True,
37
+ ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
38
+ r"""
39
+ The [`UNetSpatioTemporalConditionModel`] forward method.
40
+
41
+ Args:
42
+ sample (`torch.FloatTensor`):
43
+ The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
44
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
45
+ encoder_hidden_states (`torch.FloatTensor`):
46
+ The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
47
+ added_time_ids: (`torch.FloatTensor`):
48
+ The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
49
+ embeddings and added to the time embeddings.
50
+ return_dict (`bool`, *optional*, defaults to `True`):
51
+ Whether or not to return a [`~models.unet_spatio_temporal_condition.UNetSpatioTemporalConditionOutput`]
+ instead of a plain tuple.
+ Returns:
+ [`~models.unet_spatio_temporal_condition.UNetSpatioTemporalConditionOutput`] or `tuple`:
+ If `return_dict` is True, an [`~models.unet_spatio_temporal_condition.UNetSpatioTemporalConditionOutput`] is
+ returned, otherwise a `tuple` is returned where the first element is the sample tensor.
57
+ """
58
+ is_controlnet = mid_block_additional_residuals is not None and down_block_additional_residuals is not None
59
+
60
+ # 1. time
61
+ timesteps = timestep
62
+ if len(timesteps.shape) == 0:
63
+ timesteps = timesteps[None].to(sample.device)
64
+
65
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
66
+ batch_size, num_frames = sample.shape[:2]
67
+ timesteps = timesteps.expand(batch_size)
68
+
69
+ t_emb = self.time_proj(timesteps)
70
+
71
+ # `Timesteps` does not contain any weights and will always return f32 tensors
72
+ # but time_embedding might actually be running in fp16. so we need to cast here.
73
+ # there might be better ways to encapsulate this.
74
+ t_emb = t_emb.to(dtype=sample.dtype)
75
+
76
+ emb = self.time_embedding(t_emb)
77
+
78
+ time_embeds = self.add_time_proj(added_time_ids.flatten())
79
+ time_embeds = time_embeds.reshape((batch_size, -1))
80
+ time_embeds = time_embeds.to(emb.dtype)
81
+ aug_emb = self.add_embedding(time_embeds)
82
+ emb = emb + aug_emb
83
+
84
+ # Flatten the batch and frames dimensions
85
+ # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
86
+ sample = sample.flatten(0, 1)
87
+ # Repeat the embeddings num_video_frames times
88
+ # emb: [batch, channels] -> [batch * frames, channels]
89
+ emb = emb.repeat_interleave(num_frames, dim=0)
90
+ # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
91
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
92
+
93
+ # 2. pre-process
94
+ sample = self.conv_in(sample)
95
+
96
+ image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
97
+
98
+ down_block_res_samples = (sample,)
99
+ for downsample_block in self.down_blocks:
100
+ if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
101
+ sample, res_samples = downsample_block(
102
+ hidden_states=sample,
103
+ temb=emb,
104
+ encoder_hidden_states=encoder_hidden_states,
105
+ image_only_indicator=image_only_indicator,
106
+ )
107
+ else:
108
+ sample, res_samples = downsample_block(
109
+ hidden_states=sample,
110
+ temb=emb,
111
+ image_only_indicator=image_only_indicator,
112
+ )
113
+
114
+ down_block_res_samples += res_samples
115
+
116
+ if is_controlnet:
117
+ new_down_block_res_samples = ()
118
+ for down_block_res_sample, down_block_additional_residual in zip(
119
+ down_block_res_samples, down_block_additional_residuals
120
+ ):
121
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
122
+ new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
123
+
124
+ down_block_res_samples = new_down_block_res_samples
125
+
126
+ # 4. mid
127
+ sample = self.mid_block(
128
+ hidden_states=sample,
129
+ temb=emb,
130
+ encoder_hidden_states=encoder_hidden_states,
131
+ image_only_indicator=image_only_indicator,
132
+ )
133
+ if is_controlnet:
134
+ sample = sample + mid_block_additional_residuals
135
+
136
+ # 5. up
137
+ for i, upsample_block in enumerate(self.up_blocks):
138
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
139
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
140
+
141
+ if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
142
+ sample = upsample_block(
143
+ hidden_states=sample,
144
+ temb=emb,
145
+ res_hidden_states_tuple=res_samples,
146
+ encoder_hidden_states=encoder_hidden_states,
147
+ image_only_indicator=image_only_indicator,
148
+ )
149
+ else:
150
+ sample = upsample_block(
151
+ hidden_states=sample,
152
+ temb=emb,
153
+ res_hidden_states_tuple=res_samples,
154
+ image_only_indicator=image_only_indicator,
155
+ )
156
+
157
+ # 6. post-process
158
+ sample = self.conv_norm_out(sample)
159
+ sample = self.conv_act(sample)
160
+ sample = self.conv_out(sample)
161
+
162
+ # 7. Reshape back to original shape
163
+ sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
164
+
165
+ if not return_dict:
166
+ return (sample,)
167
+
168
+ return UNetSpatioTemporalConditionOutput(sample=sample)
169
+
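The `enable_grad` helper above is presumably what the training scripts use to restrict fine-tuning to the temporal transformer blocks. A minimal sketch of how it can be wired into an optimizer (the checkpoint path and learning rate are placeholders, not values taken from this repository):

```python
import torch
from src.models import UNetSpatioTemporalConditionModel

unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", subfolder="unet"
)

# Freeze everything except the temporal transformer blocks and hand those to the optimizer.
trainable_params = unet.enable_grad(temporal_transformer_block=True, all=False)
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)

# `get_parameters_with_grad` returns the same set and is handy as a sanity check.
n_trainable = sum(p.numel() for p in unet.get_parameters_with_grad())
n_total = sum(p.numel() for p in unet.parameters())
print(f"fine-tuning {n_trainable / 1e6:.1f}M of {n_total / 1e6:.1f}M parameters")
```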
src/pipelines/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .pipeline_video_control import StableVideoControlPipeline
+ from .pipeline_video_diffusion import VideoDiffusionPipeline
+ from .pipeline_video_control_nullmodel import StableVideoControlNullModelPipeline
+ from .pipeline_video_control_factor_guidance import StableVideoControlFactorGuidancePipeline
src/pipelines/pipeline_video_control.py ADDED
@@ -0,0 +1,408 @@
1
+ import torch
2
+ from typing import Callable, Dict, List, Optional, Union
3
+ import PIL.Image
4
+ from einops import rearrange
5
+ import time
6
+
7
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
8
+ tensor2vid,
9
+ StableVideoDiffusionPipelineOutput,
10
+ _append_dims,
11
+ EXAMPLE_DOC_STRING
12
+ )
13
+ from diffusers import StableVideoDiffusionPipeline as StableVideoDiffusionPipeline_original
14
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
15
+ from diffusers.utils import logging, replace_example_docstring
16
+ from diffusers.utils.torch_utils import randn_tensor
17
+
18
+ from src.models import UNetSpatioTemporalConditionModel, ControlNetModel
19
+ from diffusers.models import AutoencoderKLTemporalDecoder
20
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
21
+ from diffusers import EulerDiscreteScheduler
22
+ from diffusers.image_processor import VaeImageProcessor
23
+
24
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
+
26
+ class StableVideoControlPipeline(StableVideoDiffusionPipeline_original):
27
+
28
+ def __init__(
29
+ self,
30
+ vae: AutoencoderKLTemporalDecoder,
31
+ image_encoder: CLIPVisionModelWithProjection,
32
+ unet: UNetSpatioTemporalConditionModel,
33
+ controlnet: ControlNetModel,
34
+ scheduler: EulerDiscreteScheduler,
35
+ feature_extractor: CLIPImageProcessor,
36
+ null_model: UNetSpatioTemporalConditionModel = None
37
+ ):
38
+ # calling the super class constructors without calling StableVideoDiffusionPipeline_original's
39
+ DiffusionPipeline.__init__(self)
40
+
41
+ self.register_modules(
42
+ vae=vae,
43
+ image_encoder=image_encoder,
44
+ controlnet=controlnet,
45
+ unet=unet,
46
+ scheduler=scheduler,
47
+ feature_extractor=feature_extractor,
48
+ null_model=null_model,
49
+ )
50
+
51
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
52
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
53
+
54
+ def check_inputs(self, image, cond_images, height, width):
55
+ if (
56
+ not isinstance(image, torch.Tensor)
57
+ and not isinstance(image, PIL.Image.Image)
58
+ and not isinstance(image, list)
59
+ ):
60
+ raise ValueError(
61
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
62
+ f" {type(image)}"
63
+ )
64
+ if not isinstance(cond_images, torch.Tensor):
65
+ raise ValueError(
66
+ "`cond_images` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
67
+ f" {type(cond_images)}"
68
+ )
69
+
70
+ if height % 8 != 0 or width % 8 != 0:
71
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
72
+
73
+
74
+ def _encode_vae_condition(
75
+ self,
76
+ cond_image: torch.tensor,
77
+ device: Union[str, torch.device],
78
+ num_videos_per_prompt: int,
79
+ do_classifier_free_guidance: bool,
80
+ bbox_mask_frames: List[bool] = None
81
+ ):
82
+ video_length = cond_image.shape[1]
83
+ cond_image = cond_image.to(device=device)
84
+ cond_image = cond_image.to(dtype=self.vae.dtype)
85
+
86
+ if cond_image.shape[2] == 3:
87
+ cond_image = rearrange(cond_image, "b f c h w -> (b f) c h w")
88
+ cond_em = self.vae.encode(cond_image).latent_dist.mode()
89
+ cond_em = rearrange(cond_em, "(b f) c h w -> b f c h w", f=video_length)
90
+ else:
91
+ assert cond_image.shape[2] == 4, "The input tensor should have 3 or 4 channels. 3 for frames and 4 for latents."
92
+ cond_em = cond_image
93
+
94
+ # duplicate cond_em for each generation per prompt, using mps friendly method
95
+ cond_em = cond_em.repeat(num_videos_per_prompt, 1, 1, 1, 1)
96
+
97
+ # Bbox conditioning masking during inference (requiring the model to predict behaviour instead)
98
+ if bbox_mask_frames is not None:
99
+ mask_cond = torch.tensor(bbox_mask_frames, device=cond_em.device).view(num_videos_per_prompt, video_length, 1, 1, 1)
100
+ null_embedding = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
101
+ cond_em = torch.where(mask_cond, null_embedding, cond_em)
102
+
103
+ if do_classifier_free_guidance:
104
+ # negative_cond_em = torch.zeros_like(cond_em)
105
+ negative_cond_em = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
106
+
107
+ # For classifier free guidance, we need to do two forward passes.
108
+ # Here we concatenate the unconditional and text embeddings into a single batch
109
+ # to avoid doing two forward passes
110
+ cond_em = torch.cat([negative_cond_em, cond_em])
111
+
112
+ return cond_em
113
+
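+ # Example: with num_videos_per_prompt=1 and a 25-frame clip, bbox_mask_frames = [False]*5 + [True]*20
+ # keeps the encoded bbox conditioning for the first 5 frames and swaps the remaining 20 for the learned
+ # bbox_null_embedding, so the model has to extrapolate the agents' behaviour for the masked frames.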
114
+ @property
115
+ def do_classifier_free_guidance(self):
116
+ # Don't do the normal CFG when using null model. The null model will take care of computing the unconditional noise
117
+ if self.null_model is not None:
118
+ return False
119
+ if isinstance(self.guidance_scale, (int, float)):
120
+ return self.guidance_scale > 1
121
+ return self.guidance_scale.max() > 1
122
+
123
+ @torch.no_grad()
124
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
125
+ def __call__(
126
+ self,
127
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
128
+ cond_images: torch.FloatTensor = None,
129
+ bbox_mask_frames: List[bool] = None,
130
+ action_type: torch.FloatTensor = None,
131
+ height: int = 576,
132
+ width: int = 1024,
133
+ num_frames: Optional[int] = None,
134
+ num_inference_steps: int = 25,
135
+ min_guidance_scale: float = 1.0,
136
+ max_guidance_scale: float = 3.0,
137
+ control_condition_scale: float=1.0,
138
+ fps: int = 7,
139
+ motion_bucket_id: int = 127,
140
+ noise_aug_strength: float = 0.02,
141
+ decode_chunk_size: Optional[int] = None,
142
+ num_videos_per_prompt: Optional[int] = 1,
143
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
144
+ latents: Optional[torch.FloatTensor] = None,
145
+ output_type: Optional[str] = "pil",
146
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
147
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
148
+ return_dict: bool = True,
149
+ ):
150
+ r"""
151
+ The call function to the pipeline for generation.
152
+
153
+ Args:
154
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
155
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
156
+ 1]`.
157
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
158
+ The height in pixels of the generated image.
159
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
160
+ The width in pixels of the generated image.
161
+ num_frames (`int`, *optional*):
162
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
163
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
164
+ num_inference_steps (`int`, *optional*, defaults to 25):
165
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
166
+ expense of slower inference. This parameter is modulated by `strength`.
167
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
168
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
169
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
170
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
171
+ fps (`int`, *optional*, defaults to 7):
172
+ Frames per second. The rate at which the generated images shall be exported to a video after
173
+ generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
174
+ motion_bucket_id (`int`, *optional*, defaults to 127):
175
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
176
+ will be in the video.
177
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
178
+ The amount of noise added to the init image, the higher it is the less the video will look like the
179
+ init image. Increase it for more motion.
180
+ action_type (`torch.FloatTensor`, *optional*, defaults to None):
181
+ The action type to condition the generation. These features are used by the ControlNet
182
+ to influence the generation process. The features should be of shape `[batch_size, 1]`.
183
+ decode_chunk_size (`int`, *optional*):
184
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
185
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
186
+ For lower memory usage, reduce `decode_chunk_size`.
187
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
188
+ The number of videos to generate per prompt.
189
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
190
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
191
+ generation deterministic.
192
+ latents (`torch.FloatTensor`, *optional*):
193
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
194
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
195
+ tensor is generated by sampling using the supplied random `generator`.
196
+ output_type (`str`, *optional*, defaults to `"pil"`):
197
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
198
+ callback_on_step_end (`Callable`, *optional*):
199
+ A function that is called at the end of each denoising step during inference. The function is called
200
+ with the following arguments:
201
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
202
+ `callback_kwargs` will include a list of all tensors as specified by
203
+ `callback_on_step_end_tensor_inputs`.
204
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
205
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
206
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
207
+ `._callback_tensor_inputs` attribute of your pipeline class.
208
+ return_dict (`bool`, *optional*, defaults to `True`):
209
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
210
+ plain tuple.
211
+
212
+ Examples:
213
+
214
+ Returns:
215
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
216
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
217
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
218
+ is returned.
219
+ """
220
+ # 0. Default height and width to unet
221
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
222
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
223
+
224
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
225
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
226
+
227
+ # 1. Check inputs. Raise error if not correct
228
+ self.check_inputs(image, cond_images, height, width)
229
+
230
+ # 2. Define call parameters
231
+ if isinstance(image, PIL.Image.Image):
232
+ batch_size = 1
233
+ elif isinstance(image, list):
234
+ batch_size = len(image)
235
+ else:
236
+ batch_size = image.shape[0]
237
+ device = self._execution_device
238
+ vae_device = self.vae.device
239
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
240
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
241
+ # corresponds to doing no classifier free guidance.
242
+ self._guidance_scale = max_guidance_scale
243
+
244
+ # 3. Encode input image
245
+ image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
246
+
247
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
248
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
249
+ fps = fps - 1
250
+
251
+ # 4. Encode input image using VAE
252
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device)
253
+ noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
254
+ image = image + noise_aug_strength * noise
255
+
256
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
257
+ if needs_upcasting:
258
+ self.vae.to(dtype=torch.float32)
259
+
260
+ image_latents = self._encode_vae_image(
261
+ image,
262
+ device=device,
263
+ num_videos_per_prompt=num_videos_per_prompt,
264
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
265
+ )
266
+ image_latents = image_latents.to(image_embeddings.dtype)
267
+
268
+ # Repeat the image latents for each frame so we can concatenate them with the noise
269
+ # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
270
+ image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
271
+ # 5. Get Added Time IDs
272
+ added_time_ids = self._get_add_time_ids(
273
+ fps,
274
+ motion_bucket_id,
275
+ noise_aug_strength,
276
+ image_embeddings.dtype,
277
+ batch_size,
278
+ num_videos_per_prompt,
279
+ self.do_classifier_free_guidance,
280
+ )
281
+ added_time_ids = added_time_ids.to(device)
282
+
283
+ # 6. Prepare timesteps
284
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
285
+ timesteps = self.scheduler.timesteps
286
+
287
+ # 7a. Prepare latent variables
288
+ num_channels_latents = self.unet.config.out_channels*2
289
+ latents = self.prepare_latents(
290
+ batch_size * num_videos_per_prompt,
291
+ num_frames,
292
+ num_channels_latents,
293
+ height,
294
+ width,
295
+ image_embeddings.dtype,
296
+ device,
297
+ generator,
298
+ latents,
299
+ )
300
+
301
+ # 7b. Prepare control latent embeds
302
+ if cond_images is not None:
303
+ cond_em = self._encode_vae_condition(cond_images,
304
+ device,
305
+ num_videos_per_prompt,
306
+ self.do_classifier_free_guidance,
307
+ bbox_mask_frames=bbox_mask_frames)
308
+ cond_em = cond_em.to(image_embeddings.dtype)
309
+ else:
310
+ cond_em = None
311
+
312
+ # 7c. Prepare action features
313
+ if action_type is not None:
314
+ if self.do_classifier_free_guidance:
315
+ action_type = torch.cat([torch.zeros_like(action_type).unsqueeze(0), action_type.unsqueeze(0)])
316
+ else:
317
+ action_type = None
318
+
319
+ # 8. Prepare guidance scale
320
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
321
+ guidance_scale = guidance_scale.to(device, latents.dtype)
322
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
323
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
324
+
325
+ self._guidance_scale = guidance_scale
326
+
327
+ # 9. Denoising loop
328
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
329
+ self._num_timesteps = len(timesteps)
330
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
331
+ for i, t in enumerate(timesteps):
332
+ # expand the latents if we are doing classifier free guidance
333
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
334
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
335
+
336
+ # Concatenate image_latents over channels dimension
337
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
338
+ down_block_additional_residuals, mid_block_additional_residuals = self.controlnet(
339
+ latent_model_input,
340
+ timestep=t,
341
+ encoder_hidden_states=image_embeddings,
342
+ added_time_ids=added_time_ids,
343
+ control_cond=cond_em,
344
+ action_type=action_type,
345
+ conditioning_scale=control_condition_scale,
346
+ return_dict=False,
347
+ )
348
+
349
+ # predict the noise residual
350
+ noise_pred = self.unet(
351
+ sample=latent_model_input,
352
+ timestep=t,
353
+ encoder_hidden_states=image_embeddings,
354
+ added_time_ids=added_time_ids,
355
+ down_block_additional_residuals=down_block_additional_residuals,
356
+ mid_block_additional_residuals=mid_block_additional_residuals,
357
+ return_dict=False,
358
+ )[0]
359
+
360
+ # Predict the unconditional noise with the frozen null model
+ if self.null_model is not None:
+ # Time the call with a separate variable so the scheduler timestep `t` is not overwritten
+ start_time = time.time()
+ noise_pred_uncond = self.null_model(
+ latent_model_input,
+ t,
+ encoder_hidden_states=image_embeddings,
+ added_time_ids=added_time_ids,
+ return_dict=False,
+ )[0]
+ logger.debug(f"Computed uncond noise in: {time.time()-start_time:.4f}s")
371
+
372
+ # perform guidance
373
+ if self.do_classifier_free_guidance:
374
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
375
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
376
+ elif self.null_model is not None:
377
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred - noise_pred_uncond)
378
+
379
+ # compute the previous noisy sample x_t -> x_t-1
380
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
381
+
382
+ if callback_on_step_end is not None:
383
+ callback_kwargs = {}
384
+ for k in callback_on_step_end_tensor_inputs:
385
+ callback_kwargs[k] = locals()[k]
386
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
387
+
388
+ latents = callback_outputs.pop("latents", latents)
389
+
390
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
391
+ progress_bar.update()
392
+
393
+ if not output_type == "latent":
394
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
395
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
396
+ else:
397
+ frames = latents
398
+
399
+ # cast back to fp16 if needed
400
+ if needs_upcasting:
401
+ self.vae.to(dtype=torch.float16)
402
+
403
+ self.maybe_free_model_hooks()
404
+
405
+ if not return_dict:
406
+ return frames
407
+
408
+ return StableVideoDiffusionPipelineOutput(frames=frames)
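End to end, the pipeline above is driven roughly as in the sketch below. This is an illustrative sketch rather than the repository's documented entry point (`run_gen_videos.py` is the actual script): the checkpoint locations, class id, and image sizes are placeholders, and `cond_images` stands for the preprocessed per-frame bbox renderings produced by the dataset code.

```python
import torch
from PIL import Image
from src.models import UNetSpatioTemporalConditionModel, ControlNetModel
from src.pipelines import StableVideoControlPipeline

# Placeholder locations; the custom UNet class is loaded explicitly so that the
# ControlNet residual kwargs in its forward are available.
controlnet = ControlNetModel.from_pretrained("path/to/trained_controlnet")
unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", subfolder="unet"
)
pipe = StableVideoControlPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    unet=unet,
    controlnet=controlnet,
).to("cuda")

first_frame = Image.open("first_frame.png").convert("RGB")
cond_images = torch.rand(1, 25, 3, 320, 512)  # bbox-condition frames, preprocessed as in the dataset code
action_type = torch.tensor([2])               # dataset-specific action/accident class id (placeholder)

video = pipe(
    image=first_frame,
    cond_images=cond_images,
    action_type=action_type,
    height=320, width=512, num_frames=25,
    min_guidance_scale=1.0, max_guidance_scale=3.0,
    control_condition_scale=1.0,
    decode_chunk_size=8,
).frames[0]
```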
src/pipelines/pipeline_video_control_factor_guidance.py ADDED
@@ -0,0 +1,615 @@
1
+ import torch
2
+ from typing import Callable, Dict, List, Optional, Union
3
+ import PIL.Image
4
+ from einops import rearrange
5
+ import numpy as np
6
+
7
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
8
+ tensor2vid,
9
+ StableVideoDiffusionPipelineOutput,
10
+ _append_dims,
11
+ EXAMPLE_DOC_STRING
12
+ )
13
+ from diffusers import StableVideoDiffusionPipeline as StableVideoDiffusionPipeline_original
14
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
15
+ from diffusers.utils import logging, replace_example_docstring
16
+ from diffusers.utils.torch_utils import randn_tensor
17
+
18
+ from src.models import UNetSpatioTemporalConditionModel, ControlNetModel
19
+ from diffusers.models import AutoencoderKLTemporalDecoder
20
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
21
+ from diffusers import EulerDiscreteScheduler
22
+ from diffusers.image_processor import VaeImageProcessor
23
+
24
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
+
26
+
27
+ PipelineImageInput = Union[
28
+ PIL.Image.Image,
29
+ np.ndarray,
30
+ torch.FloatTensor,
31
+ List[PIL.Image.Image],
32
+ List[np.ndarray],
33
+ List[torch.FloatTensor],
34
+ ]
35
+
36
+ class StableVideoControlFactorGuidancePipeline(StableVideoDiffusionPipeline_original):
37
+
38
+ def __init__(
39
+ self,
40
+ vae: AutoencoderKLTemporalDecoder,
41
+ image_encoder: CLIPVisionModelWithProjection,
42
+ unet: UNetSpatioTemporalConditionModel,
43
+ controlnet: ControlNetModel,
44
+ scheduler: EulerDiscreteScheduler,
45
+ feature_extractor: CLIPImageProcessor,
46
+ null_model: UNetSpatioTemporalConditionModel,
47
+ ):
48
+ # calling the super class constructors without calling StableVideoDiffusionPipeline_original's
49
+ DiffusionPipeline.__init__(self)
50
+
51
+ self.register_modules(
52
+ vae=vae,
53
+ image_encoder=image_encoder,
54
+ controlnet=controlnet,
55
+ unet=unet,
56
+ scheduler=scheduler,
57
+ feature_extractor=feature_extractor,
58
+ null_model=null_model,
59
+ )
60
+
61
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
62
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
63
+
64
+ def check_inputs(self, image, cond_images, height, width):
65
+ if (
66
+ not isinstance(image, torch.Tensor)
67
+ and not isinstance(image, PIL.Image.Image)
68
+ and not isinstance(image, list)
69
+ ):
70
+ raise ValueError(
71
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
72
+ f" {type(image)}"
73
+ )
74
+ if not isinstance(cond_images, torch.Tensor):
75
+ raise ValueError(
76
+ "`cond_images` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
77
+ f" {type(cond_images)}"
78
+ )
79
+
80
+ if height % 8 != 0 or width % 8 != 0:
81
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
82
+
83
+ def _get_add_time_ids(
84
+ self,
85
+ fps: int,
86
+ motion_bucket_id: int,
87
+ noise_aug_strength: float,
88
+ dtype: torch.dtype,
89
+ batch_size: int,
90
+ num_videos_per_prompt: int,
91
+ ):
92
+ add_time_ids = [fps, motion_bucket_id, noise_aug_strength]
93
+
94
+ passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids)
95
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
96
+
97
+ if expected_add_embed_dim != passed_add_embed_dim:
98
+ raise ValueError(
99
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
100
+ )
101
+
102
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
103
+ add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1)
104
+
105
+ return add_time_ids
106
+
107
+ def _encode_image(
108
+ self,
109
+ image: PipelineImageInput,
110
+ device: Union[str, torch.device],
111
+ num_videos_per_prompt: int,
112
+ ) -> torch.FloatTensor:
113
+ dtype = next(self.image_encoder.parameters()).dtype
114
+
115
+ if not isinstance(image, torch.Tensor):
116
+ image = self.image_processor.pil_to_numpy(image)
117
+ image = self.image_processor.numpy_to_pt(image)
118
+
119
+ # We normalize the image before resizing to match with the original implementation.
120
+ # Then we unnormalize it after resizing.
121
+ image = image * 2.0 - 1.0
122
+ image = _resize_with_antialiasing(image, (224, 224))
123
+ image = (image + 1.0) / 2.0
124
+
125
+ # Normalize the image for CLIP input
126
+ image = self.feature_extractor(
127
+ images=image,
128
+ do_normalize=True,
129
+ do_center_crop=False,
130
+ do_resize=False,
131
+ do_rescale=False,
132
+ return_tensors="pt",
133
+ ).pixel_values
134
+
135
+ image = image.to(device=device, dtype=dtype)
136
+ image_embeddings = self.image_encoder(image).image_embeds
137
+ image_embeddings = image_embeddings.unsqueeze(1)
138
+
139
+ # duplicate image embeddings for each generation per prompt, using mps friendly method
140
+ bs_embed, seq_len, _ = image_embeddings.shape
141
+ image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
142
+ image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
143
+
144
+ null_image_embeddings = torch.zeros_like(image_embeddings)
145
+
146
+ return image_embeddings, null_image_embeddings
147
+
148
+ def _encode_vae_image(
149
+ self,
150
+ image: torch.Tensor,
151
+ device: Union[str, torch.device],
152
+ num_videos_per_prompt: int,
153
+ ):
154
+ image = image.to(device=device)
155
+ image_latents = self.vae.encode(image).latent_dist.mode()
156
+
157
+ # duplicate image_latents for each generation per prompt, using mps friendly method
158
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
159
+ null_image_latents = torch.zeros_like(image_latents)
160
+
161
+ return image_latents, null_image_latents
162
+
163
+ def _encode_vae_condition(
164
+ self,
165
+ cond_image: torch.tensor,
166
+ device: Union[str, torch.device],
167
+ num_videos_per_prompt: int,
168
+ bbox_mask_frames: List[bool] = None
169
+ ):
170
+ video_length = cond_image.shape[1]
171
+ cond_image = cond_image.to(device=device)
172
+ cond_image = cond_image.to(dtype=self.vae.dtype)
173
+
174
+ if cond_image.shape[2] == 3:
175
+ cond_image = rearrange(cond_image, "b f c h w -> (b f) c h w")
176
+ cond_em = self.vae.encode(cond_image).latent_dist.mode()
177
+ cond_em = rearrange(cond_em, "(b f) c h w -> b f c h w", f=video_length)
178
+ else:
179
+ assert cond_image.shape[2] == 4, "The input tensor should have 3 or 4 channels. 3 for frames and 4 for latents."
180
+ cond_em = cond_image
181
+
182
+ # duplicate cond_em for each generation per prompt, using mps friendly method
183
+ cond_em = cond_em.repeat(num_videos_per_prompt, 1, 1, 1, 1)
184
+
185
+ # Bbox conditioning masking during inference (requiring the model to predict behaviour instead)
186
+ if bbox_mask_frames is not None:
187
+ mask_cond = torch.tensor(bbox_mask_frames, device=cond_em.device).view(num_videos_per_prompt, video_length, 1, 1, 1)
188
+ null_embedding = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
189
+ cond_em = torch.where(mask_cond, null_embedding, cond_em)
190
+
191
+ null_cond_em = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
192
+
193
+ return cond_em, null_cond_em
194
+
195
+
196
+ @torch.no_grad()
197
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
198
+ def __call__(
199
+ self,
200
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
201
+ cond_images: torch.FloatTensor = None,
202
+ bbox_mask_frames: List[bool] = None,
203
+ action_type: torch.FloatTensor = None,
204
+ height: int = 576,
205
+ width: int = 1024,
206
+ num_frames: Optional[int] = None,
207
+ num_inference_steps: int = 25,
208
+ min_guidance_scale_img: float = 1.0,
209
+ max_guidance_scale_img: float = 3.0,
210
+ min_guidance_scale_action: float = 1.0,
211
+ max_guidance_scale_action: float = 3.0,
212
+ min_guidance_scale_bbox: float = 1.0,
213
+ max_guidance_scale_bbox: float = 3.0,
214
+ control_condition_scale: float=1.0,
215
+ fps: int = 7,
216
+ motion_bucket_id: int = 127,
217
+ noise_aug_strength: float = 0.02,
218
+ decode_chunk_size: Optional[int] = None,
219
+ num_videos_per_prompt: Optional[int] = 1,
220
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
221
+ latents: Optional[torch.FloatTensor] = None,
222
+ output_type: Optional[str] = "pil",
223
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
224
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
225
+ return_dict: bool = True,
226
+ ):
227
+ r"""
228
+ The call function to the pipeline for generation.
229
+
230
+ Args:
231
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
232
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
233
+ 1]`.
234
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
235
+ The height in pixels of the generated image.
236
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
237
+ The width in pixels of the generated image.
238
+ num_frames (`int`, *optional*):
239
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
240
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
241
+ num_inference_steps (`int`, *optional*, defaults to 25):
242
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
243
+ expense of slower inference. This parameter is modulated by `strength`.
244
+ min_guidance_scale_img / min_guidance_scale_bbox / min_guidance_scale_action (`float`, *optional*, defaults to 1.0):
+ The minimum guidance scale for the image, bounding-box and action conditioning terms. Applied to the first frame.
+ max_guidance_scale_img / max_guidance_scale_bbox / max_guidance_scale_action (`float`, *optional*, defaults to 3.0):
+ The maximum guidance scale for the image, bounding-box and action conditioning terms. Applied to the last frame.
248
+ fps (`int`, *optional*, defaults to 7):
249
+ Frames per second. The rate at which the generated images shall be exported to a video after
250
+ generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
251
+ motion_bucket_id (`int`, *optional*, defaults to 127):
252
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
253
+ will be in the video.
254
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
255
+ The amount of noise added to the init image, the higher it is the less the video will look like the
256
+ init image. Increase it for more motion.
257
+ action_type (`torch.FloatTensor`, *optional*, defaults to None):
258
+ The action type to condition the generation. These features are used by the ControlNet
259
+ to influence the generation process. The features should be of shape `[batch_size, 1]`.
260
+ decode_chunk_size (`int`, *optional*):
261
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
262
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
263
+ For lower memory usage, reduce `decode_chunk_size`.
264
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
265
+ The number of videos to generate per prompt.
266
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
267
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
268
+ generation deterministic.
269
+ latents (`torch.FloatTensor`, *optional*):
270
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
271
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
272
+ tensor is generated by sampling using the supplied random `generator`.
273
+ output_type (`str`, *optional*, defaults to `"pil"`):
274
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
275
+ callback_on_step_end (`Callable`, *optional*):
276
+ A function that is called at the end of each denoising step during inference. The function is called
277
+ with the following arguments:
278
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
279
+ `callback_kwargs` will include a list of all tensors as specified by
280
+ `callback_on_step_end_tensor_inputs`.
281
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
282
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
283
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
284
+ `._callback_tensor_inputs` attribute of your pipeline class.
285
+ return_dict (`bool`, *optional*, defaults to `True`):
286
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
287
+ plain tuple.
288
+
289
+ Examples:
290
+
291
+ Returns:
292
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
293
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
294
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
295
+ is returned.
296
+ """
297
+ # 0. Default height and width to unet
298
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
299
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
300
+
301
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
302
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
303
+
304
+ # 1. Check inputs. Raise error if not correct
305
+ self.check_inputs(image, cond_images, height, width)
306
+
307
+ # 2. Define call parameters
308
+ if isinstance(image, PIL.Image.Image):
309
+ batch_size = 1
310
+ elif isinstance(image, list):
311
+ batch_size = len(image)
312
+ else:
313
+ batch_size = image.shape[0]
314
+ device = self._execution_device
315
+ vae_device = self.vae.device
316
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
317
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
318
+ # corresponds to doing no classifier free guidance.
319
+ # self._guidance_scale = max_guidance_scale
320
+
321
+ # 3. Encode input image
322
+ image_embeddings, null_image_embeddings = self._encode_image(image, device, num_videos_per_prompt)
323
+
324
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
325
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
326
+ fps = fps - 1
327
+
328
+ # 4. Encode input image using VAE
329
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device)
330
+ noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
331
+ image = image + noise_aug_strength * noise
332
+
333
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
334
+ if needs_upcasting:
335
+ self.vae.to(dtype=torch.float32)
336
+
337
+ image_latents, null_image_latents = self._encode_vae_image(
338
+ image,
339
+ device=device,
340
+ num_videos_per_prompt=num_videos_per_prompt,
341
+ )
342
+ image_latents = image_latents.to(image_embeddings.dtype)
343
+ null_image_latents = null_image_latents.to(image_embeddings.dtype)
344
+
345
+ # Repeat the image latents for each frame so we can concatenate them with the noise
346
+ # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
347
+ image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
348
+ null_image_latents = null_image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
349
+
350
+ # 5. Get Added Time IDs
351
+ added_time_ids = self._get_add_time_ids(
352
+ fps,
353
+ motion_bucket_id,
354
+ noise_aug_strength,
355
+ image_embeddings.dtype,
356
+ batch_size,
357
+ num_videos_per_prompt,
358
+ )
359
+ added_time_ids = added_time_ids.to(device)
360
+
361
+ # TODO: reshape time ids for factor guidance
362
+
363
+ # 6. Prepare timesteps
364
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
365
+ timesteps = self.scheduler.timesteps
366
+
367
+ # 7a. Prepare latent variables
368
+ num_channels_latents = self.unet.config.out_channels*2
369
+ latents = self.prepare_latents(
370
+ batch_size * num_videos_per_prompt,
371
+ num_frames,
372
+ num_channels_latents,
373
+ height,
374
+ width,
375
+ image_embeddings.dtype,
376
+ device,
377
+ generator,
378
+ latents,
379
+ )
380
+
381
+ # 7b. Prepare control latent embeds
382
+ if cond_images is not None:
383
+ cond_em, null_cond_em = self._encode_vae_condition(cond_images,
384
+ device,
385
+ num_videos_per_prompt,
386
+ bbox_mask_frames=bbox_mask_frames)
387
+ cond_em = cond_em.to(image_embeddings.dtype)
388
+ null_cond_em = null_cond_em.to(image_embeddings.dtype)
389
+ else:
390
+ cond_em = None
391
+ null_cond_em = None
392
+
393
+ # 7c. Prepare action features
394
+ if action_type is not None:
395
+ action_type, null_action_type = action_type.unsqueeze(0), torch.zeros_like(action_type).unsqueeze(0)
396
+ else:
397
+ action_type = None
398
+ null_action_type = None
399
+
400
+ # 8. Prepare guidance scales
401
+ guidance_scale_img = torch.linspace(min_guidance_scale_img, max_guidance_scale_img, num_frames).unsqueeze(0)
402
+ guidance_scale_img = guidance_scale_img.to(device, latents.dtype)
403
+ guidance_scale_img = guidance_scale_img.repeat(batch_size * num_videos_per_prompt, 1)
404
+ guidance_scale_img = _append_dims(guidance_scale_img, latents.ndim)
405
+
406
+ guidance_scale_action = torch.linspace(min_guidance_scale_action, max_guidance_scale_action, num_frames).unsqueeze(0)
407
+ guidance_scale_action = guidance_scale_action.to(device, latents.dtype)
408
+ guidance_scale_action = guidance_scale_action.repeat(batch_size * num_videos_per_prompt, 1)
409
+ guidance_scale_action = _append_dims(guidance_scale_action, latents.ndim)
410
+
411
+ guidance_scale_bbox = torch.linspace(min_guidance_scale_bbox, max_guidance_scale_bbox, num_frames).unsqueeze(0)
412
+ guidance_scale_bbox = guidance_scale_bbox.to(device, latents.dtype)
413
+ guidance_scale_bbox = guidance_scale_bbox.repeat(batch_size * num_videos_per_prompt, 1)
414
+ guidance_scale_bbox = _append_dims(guidance_scale_bbox, latents.ndim)
415
+
416
+ # Build the tensors to batch the different levels of conditioning (used for the factored CFG)
417
+ # [image_and_bbox_embeddings, image_and_bbox_and_action_embeddings]
418
+ image_embeddings = torch.cat([image_embeddings, image_embeddings])
419
+ image_latents = torch.cat([image_latents, image_latents])
420
+ cond_em = torch.cat([cond_em, cond_em])
421
+ action_type = torch.cat([null_action_type, action_type])
422
+ added_time_ids = torch.cat([added_time_ids] * 2)
423
+ latents = torch.cat([latents] * 2)
424
+
425
+ # 9. Denoising loop
426
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
427
+ self._num_timesteps = len(timesteps)
428
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
429
+ for i, t in enumerate(timesteps):
430
+ latent_model_input = latents
431
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
432
+
433
+ # print(latent_model_input.shape, image_latents.shape)
434
+
435
+ # Concatenate image_latents over channels dimension
436
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
437
+ down_block_additional_residuals, mid_block_additional_residuals = self.controlnet(
438
+ latent_model_input,
439
+ timestep=t,
440
+ encoder_hidden_states=image_embeddings,
441
+ added_time_ids=added_time_ids,
442
+ control_cond=cond_em,
443
+ action_type=action_type,
444
+ conditioning_scale=control_condition_scale,
445
+ return_dict=False,
446
+ )
447
+
448
+ # predict the noise residual
449
+ noise_pred = self.unet(
450
+ sample=latent_model_input,
451
+ timestep=t,
452
+ encoder_hidden_states=image_embeddings,
453
+ added_time_ids=added_time_ids,
454
+ down_block_additional_residuals=down_block_additional_residuals,
455
+ mid_block_additional_residuals=mid_block_additional_residuals,
456
+ return_dict=False,
457
+ )[0]
458
+
459
+ # Predict unconditional noise
460
+ noise_pred_cond_img = self.null_model(
461
+ latent_model_input,
462
+ t,
463
+ encoder_hidden_states=image_embeddings,
464
+ added_time_ids=added_time_ids,
465
+ return_dict=False,
466
+ )[0]
467
+
468
+ # Perform factored CFG
469
+ # NOTE: Currently discarding the unconditional noise prediction from the finetuned model
470
+ noise_pred_cond_img_bbox, noise_pred_cond_all = noise_pred.chunk(2)
471
+
472
+ # NOTE: `noise_pred_uncond` is technically the same as `noise_pred_cond_img` since they both condition on the image.
473
+ # Therefore we could probably remove `noise_pred_cond_img` and get similar performances and faster inference
474
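+ # Factored classifier-free guidance: with eps_img the image-only prediction from the frozen null model,
+ # eps_img_bbox the image+bbox prediction and eps_all the image+bbox+action prediction, the update below is
+ # eps = eps_img + w_bbox * (eps_img_bbox - eps_img) + w_action * (eps_all - eps_img_bbox),
+ # so each guidance scale only amplifies the signal contributed by its own conditioning level.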
+ noise_pred = noise_pred_cond_img \
475
+ + guidance_scale_bbox * (noise_pred_cond_img_bbox - noise_pred_cond_img) \
476
+ + guidance_scale_action * (noise_pred_cond_all - noise_pred_cond_img_bbox)
477
+
478
+ # compute the previous noisy sample x_t -> x_t-1
479
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
480
+ # print("latents", latents.shape)
481
+
482
+ if callback_on_step_end is not None:
483
+ callback_kwargs = {}
484
+ for k in callback_on_step_end_tensor_inputs:
485
+ callback_kwargs[k] = locals()[k]
486
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
487
+
488
+ latents = callback_outputs.pop("latents", latents)
489
+
490
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
491
+ progress_bar.update()
492
+
493
+ if not output_type == "latent":
494
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
495
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
496
+ else:
497
+ frames = latents
498
+
499
+ # cast back to fp16 if needed
500
+ if needs_upcasting:
501
+ self.vae.to(dtype=torch.float16)
502
+
503
+ self.maybe_free_model_hooks()
504
+
505
+ if not return_dict:
506
+ return frames
507
+
508
+ return StableVideoDiffusionPipelineOutput(frames=frames)
509
+
510
+
511
+ # TODO: Some helper functions from Stable Video Diffusion that we could move elsewhere
512
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
513
+ h, w = input.shape[-2:]
514
+ factors = (h / size[0], w / size[1])
515
+
516
+ # First, we have to determine sigma
517
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
518
+ sigmas = (
519
+ max((factors[0] - 1.0) / 2.0, 0.001),
520
+ max((factors[1] - 1.0) / 2.0, 0.001),
521
+ )
522
+
523
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
524
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
525
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
526
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
527
+
528
+ # Make sure it is odd
529
+ if (ks[0] % 2) == 0:
530
+ ks = ks[0] + 1, ks[1]
531
+
532
+ if (ks[1] % 2) == 0:
533
+ ks = ks[0], ks[1] + 1
534
+
535
+ input = _gaussian_blur2d(input, ks, sigmas)
536
+
537
+ output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
538
+ return output
539
+
540
+
541
+ def _compute_padding(kernel_size):
542
+ """Compute padding tuple."""
543
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
544
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
545
+ if len(kernel_size) < 2:
546
+ raise AssertionError(kernel_size)
547
+ computed = [k - 1 for k in kernel_size]
548
+
549
+ # for even kernels we need to do asymmetric padding :(
550
+ out_padding = 2 * len(kernel_size) * [0]
551
+
552
+ for i in range(len(kernel_size)):
553
+ computed_tmp = computed[-(i + 1)]
554
+
555
+ pad_front = computed_tmp // 2
556
+ pad_rear = computed_tmp - pad_front
557
+
558
+ out_padding[2 * i + 0] = pad_front
559
+ out_padding[2 * i + 1] = pad_rear
560
+
561
+ return out_padding
562
+
563
+ def _filter2d(input, kernel):
564
+ # prepare kernel
565
+ b, c, h, w = input.shape
566
+ tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)
567
+
568
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
569
+
570
+ height, width = tmp_kernel.shape[-2:]
571
+
572
+ padding_shape: list[int] = _compute_padding([height, width])
573
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
574
+
575
+ # kernel and input tensor reshape to align element-wise or batch-wise params
576
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
577
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
578
+
579
+ # convolve the tensor with the kernel.
580
+ output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
581
+
582
+ out = output.view(b, c, h, w)
583
+ return out
584
+
585
+
586
+ def _gaussian(window_size: int, sigma):
587
+ if isinstance(sigma, float):
588
+ sigma = torch.tensor([[sigma]])
589
+
590
+ batch_size = sigma.shape[0]
591
+
592
+ x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
593
+
594
+ if window_size % 2 == 0:
595
+ x = x + 0.5
596
+
597
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
598
+
599
+ return gauss / gauss.sum(-1, keepdim=True)
600
+
601
+
602
+ def _gaussian_blur2d(input, kernel_size, sigma):
603
+ if isinstance(sigma, tuple):
604
+ sigma = torch.tensor([sigma], dtype=input.dtype)
605
+ else:
606
+ sigma = sigma.to(dtype=input.dtype)
607
+
608
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
609
+ bs = sigma.shape[0]
610
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
611
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
612
+ out_x = _filter2d(input, kernel_x[..., None, :])
613
+ out = _filter2d(out_x, kernel_y[..., None])
614
+
615
+ return out
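A minimal sketch of the factorized guidance combination used in the denoising loop above, assuming the three conditional noise predictions are already available (the standalone helper below is illustrative; the pipeline computes this inline with the same variable names). With both scales set to 1.0 the expression collapses to `noise_pred_cond_all`, i.e. plain conditional sampling without extra guidance.

```python
import torch

def combine_factor_guidance(
    noise_pred_cond_img: torch.Tensor,       # conditioned on the first frame only
    noise_pred_cond_img_bbox: torch.Tensor,  # conditioned on first frame + bbox renderings
    noise_pred_cond_all: torch.Tensor,       # conditioned on first frame + bboxes + action
    guidance_scale_bbox: float = 2.0,
    guidance_scale_action: float = 2.0,
) -> torch.Tensor:
    """Compose guidance from two conditioning factors (hypothetical helper)."""
    return (
        noise_pred_cond_img
        + guidance_scale_bbox * (noise_pred_cond_img_bbox - noise_pred_cond_img)
        + guidance_scale_action * (noise_pred_cond_all - noise_pred_cond_img_bbox)
    )
```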
src/pipelines/pipeline_video_control_nullmodel.py ADDED
@@ -0,0 +1,406 @@
1
+ import torch
2
+ from typing import Callable, Dict, List, Optional, Union
3
+ import PIL.Image
4
+ from einops import rearrange
5
+
6
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
7
+ tensor2vid,
8
+ StableVideoDiffusionPipelineOutput,
9
+ _append_dims,
10
+ EXAMPLE_DOC_STRING
11
+ )
12
+ from diffusers import StableVideoDiffusionPipeline as StableVideoDiffusionPipeline_original
13
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
14
+ from diffusers.utils import logging, replace_example_docstring
15
+ from diffusers.utils.torch_utils import randn_tensor
16
+
17
+ from src.models import UNetSpatioTemporalConditionModel, ControlNetModel
18
+ from diffusers.models import AutoencoderKLTemporalDecoder
19
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
20
+ from diffusers import EulerDiscreteScheduler
21
+ from diffusers.image_processor import VaeImageProcessor
22
+
23
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
24
+
25
+ class StableVideoControlNullModelPipeline(StableVideoDiffusionPipeline_original):
26
+
27
+ def __init__(
28
+ self,
29
+ vae: AutoencoderKLTemporalDecoder,
30
+ image_encoder: CLIPVisionModelWithProjection,
31
+ unet: UNetSpatioTemporalConditionModel,
32
+ controlnet: ControlNetModel,
33
+ scheduler: EulerDiscreteScheduler,
34
+ feature_extractor: CLIPImageProcessor,
35
+ null_model: UNetSpatioTemporalConditionModel,
36
+ ):
37
+ # Call the DiffusionPipeline constructor directly, bypassing StableVideoDiffusionPipeline_original's __init__
38
+ DiffusionPipeline.__init__(self)
39
+
40
+ self.register_modules(
41
+ vae=vae,
42
+ image_encoder=image_encoder,
43
+ controlnet=controlnet,
44
+ unet=unet,
45
+ scheduler=scheduler,
46
+ feature_extractor=feature_extractor,
47
+ null_model=null_model,
48
+ )
49
+
50
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
51
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
52
+
53
+ def check_inputs(self, image, cond_images, height, width):
54
+ if (
55
+ not isinstance(image, torch.Tensor)
56
+ and not isinstance(image, PIL.Image.Image)
57
+ and not isinstance(image, list)
58
+ ):
59
+ raise ValueError(
60
+ "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is"
61
+ f" {type(image)}"
62
+ )
63
+ if not isinstance(cond_images, torch.Tensor):
64
+ raise ValueError(
65
+ "`cond_images` has to be of type `torch.Tensor` but is"
66
+ f" {type(cond_images)}"
67
+ )
68
+
69
+ if height % 8 != 0 or width % 8 != 0:
70
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
71
+
72
+
73
+ def _encode_vae_condition(
74
+ self,
75
+ cond_image: torch.tensor,
76
+ device: Union[str, torch.device],
77
+ num_videos_per_prompt: int,
78
+ do_classifier_free_guidance: bool,
79
+ bbox_mask_frames: List[bool] = None
80
+ ):
81
+ video_length = cond_image.shape[1]
82
+ cond_image = cond_image.to(device=device)
83
+ cond_image = cond_image.to(dtype=self.vae.dtype)
84
+
85
+ if cond_image.shape[2] == 3:
86
+ cond_image = rearrange(cond_image, "b f c h w -> (b f) c h w")
87
+ cond_em = self.vae.encode(cond_image).latent_dist.mode()
88
+ cond_em = rearrange(cond_em, "(b f) c h w -> b f c h w", f=video_length)
89
+ else:
90
+ assert cond_image.shape[2] == 4, "The input tensor should have 3 or 4 channels. 3 for frames and 4 for latents."
91
+ cond_em = cond_image
92
+
93
+ # duplicate cond_em for each generation per prompt, using mps friendly method
94
+ cond_em = cond_em.repeat(num_videos_per_prompt, 1, 1, 1, 1)
95
+
96
+ # Bbox conditioning masking during inference (requiring the model to predict behaviour instead)
97
+ if bbox_mask_frames is not None:
98
+ mask_cond = torch.tensor(bbox_mask_frames, device=cond_em.device).view(num_videos_per_prompt, video_length, 1, 1, 1)
99
+ null_embedding = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
100
+ cond_em = torch.where(mask_cond, null_embedding, cond_em)
101
+
102
+ if do_classifier_free_guidance:
103
+ # negative_cond_em = torch.zeros_like(cond_em)
104
+ negative_cond_em = self.controlnet.bbox_null_embedding.repeat(num_videos_per_prompt, video_length, 1, 1, 1)
105
+
106
+ # For classifier free guidance, we need to do two forward passes.
107
+ # Here we concatenate the unconditional and text embeddings into a single batch
108
+ # to avoid doing two forward passes
109
+ cond_em = torch.cat([negative_cond_em, cond_em])
110
+
111
+ return cond_em
112
+
113
+ @property
114
+ def do_classifier_free_guidance(self):
115
+ return False
116
+ # if isinstance(self.guidance_scale, (int, float)):
117
+ # return self.guidance_scale > 1
118
+ # return self.guidance_scale.max() > 1
119
+
120
+ @torch.no_grad()
121
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
122
+ def __call__(
123
+ self,
124
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
125
+ cond_images: torch.FloatTensor = None,
126
+ bbox_mask_frames: List[bool] = None,
127
+ action_type: torch.FloatTensor = None,
128
+ height: int = 576,
129
+ width: int = 1024,
130
+ num_frames: Optional[int] = None,
131
+ num_inference_steps: int = 25,
132
+ min_guidance_scale: float = 1.0,
133
+ max_guidance_scale: float = 3.0,
134
+ control_condition_scale: float=1.0,
135
+ fps: int = 7,
136
+ motion_bucket_id: int = 127,
137
+ noise_aug_strength: float = 0.02,
138
+ decode_chunk_size: Optional[int] = None,
139
+ num_videos_per_prompt: Optional[int] = 1,
140
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
141
+ latents: Optional[torch.FloatTensor] = None,
142
+ output_type: Optional[str] = "pil",
143
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
144
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
145
+ return_dict: bool = True,
146
+ ):
147
+ r"""
148
+ The call function to the pipeline for generation.
149
+
150
+ Args:
151
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
152
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
153
+ 1]`.
154
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
155
+ The height in pixels of the generated image.
156
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
157
+ The width in pixels of the generated image.
158
+ num_frames (`int`, *optional*):
159
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
160
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
161
+ num_inference_steps (`int`, *optional*, defaults to 25):
162
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
163
+ expense of slower inference. This parameter is modulated by `strength`.
164
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
165
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
166
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
167
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
168
+ fps (`int`, *optional*, defaults to 7):
169
+ generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
170
+ generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
171
+ motion_bucket_id (`int`, *optional*, defaults to 127):
172
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
173
+ will be in the video.
174
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
175
+ The amount of noise added to the init image, the higher it is the less the video will look like the
176
+ init image. Increase it for more motion.
177
+ action_type (`torch.FloatTensor`, *optional*, defaults to None):
178
+ The action type to condition the generation. These features are used by the ControlNet
179
+ to influence the generation process. The features should be of shape `[batch_size, 1]`.
180
+ decode_chunk_size (`int`, *optional*):
181
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
182
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
183
+ For lower memory usage, reduce `decode_chunk_size`.
184
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
185
+ The number of videos to generate per prompt.
186
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
187
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
188
+ generation deterministic.
189
+ latents (`torch.FloatTensor`, *optional*):
190
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
191
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
192
+ tensor is generated by sampling using the supplied random `generator`.
193
+ output_type (`str`, *optional*, defaults to `"pil"`):
194
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
195
+ callback_on_step_end (`Callable`, *optional*):
196
+ A function that is called at the end of each denoising step during inference. The function is called
197
+ with the following arguments:
198
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
199
+ `callback_kwargs` will include a list of all tensors as specified by
200
+ `callback_on_step_end_tensor_inputs`.
201
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
202
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
203
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
204
+ `._callback_tensor_inputs` attribute of your pipeline class.
205
+ return_dict (`bool`, *optional*, defaults to `True`):
206
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
207
+ plain tuple.
208
+
209
+ Examples:
210
+
211
+ Returns:
212
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
213
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
214
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
215
+ is returned.
216
+ """
217
+ # 0. Default height and width to unet
218
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
219
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
220
+
221
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
222
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
223
+
224
+ # 1. Check inputs. Raise error if not correct
225
+ self.check_inputs(image, cond_images, height, width)
226
+
227
+ # 2. Define call parameters
228
+ if isinstance(image, PIL.Image.Image):
229
+ batch_size = 1
230
+ elif isinstance(image, list):
231
+ batch_size = len(image)
232
+ else:
233
+ batch_size = image.shape[0]
234
+ device = self._execution_device
235
+ vae_device = self.vae.device
236
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
237
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
238
+ # corresponds to doing no classifier free guidance.
239
+ self._guidance_scale = max_guidance_scale
240
+
241
+ # 3. Encode input image
242
+ image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
243
+
244
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
245
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
246
+ fps = fps - 1
247
+
248
+ # 4. Encode input image using VAE
249
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device)
250
+ noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
251
+ image = image + noise_aug_strength * noise
252
+
253
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
254
+ if needs_upcasting:
255
+ self.vae.to(dtype=torch.float32)
256
+
257
+ image_latents = self._encode_vae_image(
258
+ image,
259
+ device=device,
260
+ num_videos_per_prompt=num_videos_per_prompt,
261
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
262
+ )
263
+ image_latents = image_latents.to(image_embeddings.dtype)
264
+
265
+ # Repeat the image latents for each frame so we can concatenate them with the noise
266
+ # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
267
+ image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
268
+ # 5. Get Added Time IDs
269
+ added_time_ids = self._get_add_time_ids(
270
+ fps,
271
+ motion_bucket_id,
272
+ noise_aug_strength,
273
+ image_embeddings.dtype,
274
+ batch_size,
275
+ num_videos_per_prompt,
276
+ self.do_classifier_free_guidance,
277
+ )
278
+ added_time_ids = added_time_ids.to(device)
279
+
280
+ # 6. Prepare timesteps
281
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
282
+ timesteps = self.scheduler.timesteps
283
+
284
+ # 7a. Prepare latent variables
285
+ num_channels_latents = self.unet.config.out_channels*2
286
+ latents = self.prepare_latents(
287
+ batch_size * num_videos_per_prompt,
288
+ num_frames,
289
+ num_channels_latents,
290
+ height,
291
+ width,
292
+ image_embeddings.dtype,
293
+ device,
294
+ generator,
295
+ latents,
296
+ )
297
+
298
+ # 7b. Prepare control latent embeds
299
+ if not cond_images is None:
300
+ cond_em = self._encode_vae_condition(cond_images,
301
+ device,
302
+ num_videos_per_prompt,
303
+ self.do_classifier_free_guidance,
304
+ bbox_mask_frames=bbox_mask_frames)
305
+ cond_em = cond_em.to(image_embeddings.dtype)
306
+ else:
307
+ cond_em = None
308
+
309
+ # 7c. Prepare action features
310
+ if not action_type is None:
311
+ if self.do_classifier_free_guidance:
312
+ action_type = torch.cat([torch.zeros_like(action_type).unsqueeze(0), action_type.unsqueeze(0)])
313
+ else:
314
+ action_type = None
315
+
316
+ # 8. Prepare guidance scale
317
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
318
+ guidance_scale = guidance_scale.to(device, latents.dtype)
319
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
320
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
321
+
322
+ self._guidance_scale = guidance_scale
323
+
324
+ # 9. Denoising loop
325
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
326
+ self._num_timesteps = len(timesteps)
327
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
328
+ for i, t in enumerate(timesteps):
329
+ # expand the latents if we are doing classifier free guidance
330
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
331
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
332
+
333
+ # print(latent_model_input.shape, image_latents.shape, self.do_classifier_free_guidance)
334
+
335
+ # Concatenate image_latents over channels dimension
336
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
337
+ # latent_model_input_null_model = latent_model_input.clone().detach()
338
+ down_block_additional_residuals, mid_block_additional_residuals = self.controlnet(
339
+ latent_model_input,
340
+ timestep=t,
341
+ encoder_hidden_states=image_embeddings,
342
+ added_time_ids=added_time_ids,
343
+ control_cond=cond_em,
344
+ action_type=action_type,
345
+ conditioning_scale=control_condition_scale,
346
+ return_dict=False,
347
+ )
348
+
349
+ # predict the noise residual
350
+ noise_pred = self.unet(
351
+ sample=latent_model_input,
352
+ timestep=t,
353
+ encoder_hidden_states=image_embeddings,
354
+ added_time_ids=added_time_ids,
355
+ down_block_additional_residuals=down_block_additional_residuals,
356
+ mid_block_additional_residuals=mid_block_additional_residuals,
357
+ return_dict=False,
358
+ )[0]
359
+
360
+ # Predict unconditional noise
361
+ noise_pred_uncond = self.null_model(
362
+ latent_model_input,
363
+ t,
364
+ encoder_hidden_states=image_embeddings,
365
+ added_time_ids=added_time_ids,
366
+ return_dict=False,
367
+ )[0]
368
+
369
+ # perform guidance
370
+ if self.do_classifier_free_guidance:
371
+ _, noise_pred_cond = noise_pred.chunk(2) # NOTE: Currently discarding the unconditional noise prediction from the finetuned model
372
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
373
+ else:
374
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred - noise_pred_uncond)
375
+
376
+ # compute the previous noisy sample x_t -> x_t-1
377
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
378
+ # print("latents", latents.shape)
379
+
380
+ if callback_on_step_end is not None:
381
+ callback_kwargs = {}
382
+ for k in callback_on_step_end_tensor_inputs:
383
+ callback_kwargs[k] = locals()[k]
384
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
385
+
386
+ latents = callback_outputs.pop("latents", latents)
387
+
388
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
389
+ progress_bar.update()
390
+
391
+ if not output_type == "latent":
392
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
393
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
394
+ else:
395
+ frames = latents
396
+
397
+ # cast back to fp16 if needed
398
+ if needs_upcasting:
399
+ self.vae.to(dtype=torch.float16)
400
+
401
+ self.maybe_free_model_hooks()
402
+
403
+ if not return_dict:
404
+ return frames
405
+
406
+ return StableVideoDiffusionPipelineOutput(frames=frames)
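The guidance step of this pipeline differs from standard classifier-free guidance only in where the unconditional prediction comes from: it is produced by the separate, frozen `null_model` UNet rather than by a zeroed-out conditioning pass through the ControlNet-augmented UNet. A minimal sketch of that mixing step (the helper is illustrative, not part of the pipeline):

```python
import torch

def null_model_guidance(
    noise_pred_cond: torch.Tensor,    # prediction from the ControlNet-conditioned UNet
    noise_pred_uncond: torch.Tensor,  # prediction from the frozen null (unconditional) model
    guidance_scale: torch.Tensor,     # per-frame scale, broadcastable to the latent shape
) -> torch.Tensor:
    # Same formula as classifier-free guidance, with the unconditional term
    # supplied by a separate model instead of a negative-conditioning pass.
    return noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
```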
src/pipelines/pipeline_video_diffusion.py ADDED
@@ -0,0 +1,305 @@
1
+ from diffusers import StableVideoDiffusionPipeline as StableVideoDiffusionPipeline_original
2
+ import torch
3
+ from einops import rearrange
4
+ from diffusers.utils import BaseOutput, logging, replace_example_docstring
5
+ from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
6
+ from typing import Callable, Dict, List, Tuple, Optional, Union
7
+ import PIL.Image
8
+ from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import (
9
+ tensor2vid,
10
+ StableVideoDiffusionPipelineOutput,
11
+ _append_dims,
12
+ EXAMPLE_DOC_STRING
13
+ )
14
+
15
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
16
+
17
+ class VideoDiffusionPipeline(StableVideoDiffusionPipeline_original):
18
+
19
+ def _encode_vae_condition(
20
+ self,
21
+ cond_image: torch.tensor,
22
+ device: Union[str, torch.device],
23
+ num_videos_per_prompt: int,
24
+ do_classifier_free_guidance: bool,
25
+ ):
26
+ video_length = cond_image.shape[1]
27
+ cond_image = cond_image.to(device=device)
28
+ cond_image = cond_image.to(dtype=self.vae.dtype)
29
+ cond_image = rearrange(cond_image, "b f c h w -> (b f) c h w")
30
+ cond_em = self.vae.encode(cond_image).latent_dist.mode()
31
+ cond_em = rearrange(cond_em, "(b f) c h w -> b f c h w", f=video_length)
32
+
33
+ # duplicate cond_em for each generation per prompt, using mps friendly method
34
+ cond_em = cond_em.repeat(num_videos_per_prompt, 1, 1, 1, 1)
35
+
36
+ if do_classifier_free_guidance:
37
+ negative_cond_em = torch.zeros_like(cond_em)
38
+
39
+ # For classifier free guidance, we need to do two forward passes.
40
+ # Here we concatenate the unconditional and text embeddings into a single batch
41
+ # to avoid doing two forward passes
42
+ cond_em = torch.cat([negative_cond_em, cond_em])
43
+
44
+ return cond_em
45
+
46
+ def decode_latent_to_video(self, latents,
47
+ decode_chunk_size: Optional[int] = None,
48
+ num_frames: Optional[int] = None,
49
+ output_type: Optional[str] = "pil",):
50
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
51
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
+ return frames
52
+
53
+ @torch.no_grad()
54
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
55
+ def __call__(
56
+ self,
57
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
58
+ bbox_images: Optional[torch.FloatTensor] = None,
59
+ bbox_conditions: Optional[Dict[str, Union[torch.FloatTensor, List[Union[float, int]]]]] = None,
60
+ original_size: Optional[Tuple[int]] = (1242, 375),
61
+ height: int = 576,
62
+ width: int = 1024,
63
+ num_frames: Optional[int] = None,
64
+ num_inference_steps: int = 25,
65
+ min_guidance_scale: float = 1.0,
66
+ max_guidance_scale: float = 3.0,
67
+ fps: int = 7,
68
+ motion_bucket_id: int = 127,
69
+ noise_aug_strength: float = 0.02,
70
+ decode_chunk_size: Optional[int] = None,
71
+ num_videos_per_prompt: Optional[int] = 1,
72
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
73
+ latents: Optional[torch.FloatTensor] = None,
74
+ output_type: Optional[str] = "pil",
75
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
76
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
77
+ return_dict: bool = True,
78
+ num_cond_bbox_frames: int=3,
79
+ ):
80
+ r"""
81
+ The call function to the pipeline for generation.
82
+
83
+ Args:
84
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
85
+ Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
86
+ 1]`.
87
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
88
+ The height in pixels of the generated image.
89
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
90
+ The width in pixels of the generated image.
91
+ num_frames (`int`, *optional*):
92
+ The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
93
+ `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
94
+ num_inference_steps (`int`, *optional*, defaults to 25):
95
+ The number of denoising steps. More denoising steps usually lead to a higher quality video at the
96
+ expense of slower inference. This parameter is modulated by `strength`.
97
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
98
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
99
+ max_guidance_scale (`float`, *optional*, defaults to 3.0):
100
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
101
+ fps (`int`, *optional*, defaults to 7):
102
+ Frames per second. The rate at which the generated images shall be exported to a video after
103
+ generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
104
+ motion_bucket_id (`int`, *optional*, defaults to 127):
105
+ Used for conditioning the amount of motion for the generation. The higher the number the more motion
106
+ will be in the video.
107
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
108
+ The amount of noise added to the init image, the higher it is the less the video will look like the
109
+ init image. Increase it for more motion.
110
+ decode_chunk_size (`int`, *optional*):
111
+ The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
112
+ expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
113
+ For lower memory usage, reduce `decode_chunk_size`.
114
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
115
+ The number of videos to generate per prompt.
116
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
117
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
118
+ generation deterministic.
119
+ latents (`torch.FloatTensor`, *optional*):
120
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
121
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
122
+ tensor is generated by sampling using the supplied random `generator`.
123
+ output_type (`str`, *optional*, defaults to `"pil"`):
124
+ The output format of the generated image. Choose between `pil`, `np` or `pt`.
125
+ callback_on_step_end (`Callable`, *optional*):
126
+ A function that is called at the end of each denoising step during inference. The function is called
127
+ with the following arguments:
128
+ `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
129
+ `callback_kwargs` will include a list of all tensors as specified by
130
+ `callback_on_step_end_tensor_inputs`.
131
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
132
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
133
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
134
+ `._callback_tensor_inputs` attribute of your pipeline class.
135
+ return_dict (`bool`, *optional*, defaults to `True`):
136
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
137
+ plain tuple.
138
+
139
+ Examples:
140
+
141
+ Returns:
142
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
143
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
144
+ returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
145
+ is returned.
146
+ """
147
+ # 0. Default height and width to unet
148
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
149
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
150
+
151
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
152
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
153
+
154
+ # 1. Check inputs. Raise error if not correct
155
+ self.check_inputs(image, height, width)
156
+
157
+ # 2. Define call parameters
158
+ if isinstance(image, PIL.Image.Image):
159
+ batch_size = 1
160
+ elif isinstance(image, list):
161
+ batch_size = len(image)
162
+ else:
163
+ batch_size = image.shape[0]
164
+ device = self._execution_device
165
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
166
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
167
+ # corresponds to doing no classifier free guidance.
168
+ self._guidance_scale = max_guidance_scale
169
+
170
+ # 3. Encode input image
171
+ image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance)
172
+
173
+ # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here.
174
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
175
+ fps = fps - 1
176
+
177
+ # 4. Encode input image using VAE
178
+ image = self.image_processor.preprocess(image, height=height, width=width).to(device)
179
+ noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype)
180
+ image = image + noise_aug_strength * noise
181
+
182
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
183
+ if needs_upcasting:
184
+ self.vae.to(dtype=torch.float32)
185
+
186
+ image_latents = self._encode_vae_image(
187
+ image,
188
+ device=device,
189
+ num_videos_per_prompt=num_videos_per_prompt,
190
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
191
+ )
192
+ image_latents = image_latents.to(image_embeddings.dtype)
193
+
194
+ # Repeat the image latents for each frame so we can concatenate them with the noise
195
+ # image_latents [batch, channels, height, width] -> [batch, num_frames, channels, height, width]
196
+ image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
197
+
198
+ # 7b. Prepare control latent embeds
199
+ if not bbox_images is None:
200
+ cond_latents = self._encode_vae_condition(bbox_images,
201
+ device,
202
+ num_videos_per_prompt,
203
+ self.do_classifier_free_guidance)
204
+ image_latents[:,0:num_cond_bbox_frames,::] = cond_latents[:,0:num_cond_bbox_frames,::]
205
+ image_latents[:,-1,::]=cond_latents[:,-1,::]
206
+
207
+ # 5. Get Added Time IDs
208
+ added_time_ids = self._get_add_time_ids(
209
+ fps,
210
+ motion_bucket_id,
211
+ noise_aug_strength,
212
+ image_embeddings.dtype,
213
+ batch_size,
214
+ num_videos_per_prompt,
215
+ self.do_classifier_free_guidance,
216
+ )
217
+ added_time_ids = added_time_ids.to(device)
218
+
219
+ # 6. Prepare timesteps
220
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
221
+ timesteps = self.scheduler.timesteps
222
+
223
+ # 7. Prepare latent variables
224
+ num_channels_latents = self.unet.config.out_channels*2
225
+ latents = self.prepare_latents(
226
+ batch_size * num_videos_per_prompt,
227
+ num_frames,
228
+ num_channels_latents,
229
+ height,
230
+ width,
231
+ image_embeddings.dtype,
232
+ device,
233
+ generator,
234
+ latents,
235
+ )
236
+
237
+ # 8. Prepare guidance scale
238
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
239
+ guidance_scale = guidance_scale.to(device, latents.dtype)
240
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
241
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
242
+
243
+ self._guidance_scale = guidance_scale
244
+
245
+ # 9. Denoising loop
246
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
247
+ self._num_timesteps = len(timesteps)
248
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
249
+ for i, t in enumerate(timesteps):
250
+ # expand the latents if we are doing classifier free guidance
251
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
252
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
253
+
254
+ # Concatenate image_latents over channels dimension
255
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
256
+
257
+ # predict the noise residual
258
+ noise_pred = self.unet(
259
+ latent_model_input,
260
+ t,
261
+ encoder_hidden_states=image_embeddings,
262
+ added_time_ids=added_time_ids,
263
+ return_dict=False,
264
+ )[0]
265
+
266
+ # perform guidance
267
+ if self.do_classifier_free_guidance:
268
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
269
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
270
+
271
+ # compute the previous noisy sample x_t -> x_t-1
272
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
273
+
274
+ if callback_on_step_end is not None:
275
+ callback_kwargs = {}
276
+ for k in callback_on_step_end_tensor_inputs:
277
+ callback_kwargs[k] = locals()[k]
278
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
279
+
280
+ latents = callback_outputs.pop("latents", latents)
281
+
282
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
283
+ progress_bar.update()
284
+
285
+ if not output_type == "latent":
286
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
287
+ frames = torch.clamp(frames, -1, 1)
288
+ # Not sure why this code was here; it zeroed out frames whose minimum value was above -0.9:
289
+ # for i in range(frames.shape[2]):
290
+ # frame = frames[:, :, i]
291
+ # if frame.min() > -0.9:
292
+ # frames[:,:,i] = torch.zeros_like(frame)
293
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
294
+ else:
295
+ frames = latents
296
+
297
+ # cast back to fp16 if needed
298
+ if needs_upcasting:
299
+ self.vae.to(dtype=torch.float16)
300
+ self.maybe_free_model_hooks()
301
+
302
+ if not return_dict:
303
+ return frames
304
+
305
+ return StableVideoDiffusionPipelineOutput(frames=frames)
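The bbox conditioning in this pipeline works by overwriting part of the per-frame conditioning latents before they are concatenated with the noise: the first `num_cond_bbox_frames` frames and the final frame take the VAE-encoded bbox renderings, while the remaining frames keep the repeated first-frame latent. A short sketch of that substitution, assuming the same `[batch, num_frames, channels, h, w]` layout used above (the helper is illustrative only):

```python
import torch

def inject_bbox_latents(
    image_latents: torch.Tensor,  # [batch, num_frames, channels, h, w], first-frame latent repeated
    cond_latents: torch.Tensor,   # [batch, num_frames, channels, h, w], VAE-encoded bbox renderings
    num_cond_bbox_frames: int = 3,
) -> torch.Tensor:
    # Overwrite the leading frames and the last frame with bbox latents,
    # mirroring step 7b of VideoDiffusionPipeline.__call__.
    out = image_latents.clone()
    out[:, :num_cond_bbox_frames] = cond_latents[:, :num_cond_bbox_frames]
    out[:, -1] = cond_latents[:, -1]
    return out
```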
src/preprocess/README.md ADDED
@@ -0,0 +1,105 @@
1
+ # Video Dataset Processing Tools
2
+
3
+ This directory contains tools for processing and filtering video datasets. There are two main types of tools:
4
+
5
+ 1. Dataset Preprocessing Tools (`preprocess_*.py`)
6
+ 2. Dataset Filtering Tool (`filter_dataset_tool.py`)
7
+
8
+ ## Dataset Preprocessing Tools
9
+
10
+ These scripts process raw video datasets (DADA2000, CAP, and Russia Car Crash) by:
11
+ - Extracting frames at specified FPS
12
+ - Cropping frames to desired dimensions
13
+ - Generating object detection labels
14
+ - Creating train/val splits
15
+
16
+ ### Usage
17
+
18
+ Basic usage with default settings:
19
+ ```bash
20
+ # For DADA2000 dataset
21
+ python preprocess_dada_dataset.py
22
+
23
+ # For CAP dataset
24
+ python preprocess_cap_dataset.py
25
+
26
+ # For Russia Car Crash dataset
27
+ python preprocess_russia_dataset.py
28
+ ```
29
+
30
+ Advanced usage with custom settings:
31
+ ```bash
32
+ python preprocess_dada_dataset.py \
33
+ --dataset_root /path/to/datasets \
34
+ --dataset_dir /path/to/raw/dataset \
35
+ --out_directory /path/to/output \
36
+ --out_fps 15 \
37
+ --skip_extraction \
38
+ --skip_labels \
39
+ --skip_split
40
+ ```
41
+
42
+ ### Common Arguments
43
+ - `--dataset_root`: Root directory for datasets
44
+ - `--dataset_dir`: Directory containing the raw dataset
45
+ - `--out_directory`: Output directory (defaults to {dataset_root}/dataset_name)
46
+ - `--skip_extraction`: Skip frame extraction step
47
+ - `--skip_labels`: Skip label generation step
48
+ - `--skip_split`: Skip train/val split step
49
+
50
+ ### Dataset-Specific Arguments
51
+ - DADA2000:
52
+ - `--out_fps`: Output frames per second (default: 12)
53
+ - CAP:
54
+ - `--reverse`: Process samples in reverse order
55
+ - Russia:
56
+ - `--process_train`: Process training set (default is validation set only)
57
+
58
+ ## Dataset Filtering Tool
59
+
60
+ A tool for manually reviewing and filtering video datasets. It provides an interactive interface to review video frames and mark them as high quality or rejected. The tool can also automatically detect upscaled videos and scene changes to help with the filtering process.
61
+
62
+ ### Features
63
+ - Interactive video frame review with keyboard controls
64
+ - Automatic detection of upscaled videos
65
+ - Scene change detection
66
+ - Caching support for faster processing
67
+ - Support for both single-category and multi-category datasets
68
+
69
+ ### Usage
70
+
71
+ Basic usage with default settings:
72
+ ```bash
73
+ python filter_dataset_tool.py --dataset_name my_dataset
74
+ ```
75
+
76
+ Advanced usage with all features enabled:
77
+ ```bash
78
+ python filter_dataset_tool.py \
79
+ --dataset_name my_dataset \
80
+ --start_idx 0 \
81
+ --data_dir ./custom/path/to/images \
82
+ --output_root ./custom/output/path \
83
+ --use_cache
84
+ ```
85
+
86
+ ### Keyboard Controls
87
+ - `w`: Next frame
88
+ - `s`: Previous frame
89
+ - `d`: Next video
90
+ - `a`: Previous video
91
+ - `r`: Reject video
92
+ - `h`: Mark as high quality
93
+ - `p`: Increase playback speed
94
+ - `l`: Decrease playback speed
95
+ - `ESC`: Exit
96
+
97
+ ### Command Line Arguments
98
+ - `--dataset_name`: Name of the dataset directory (required)
99
+ - `--start_idx`: Starting index for video review (default: 0)
100
+ - `--data_dir`: Custom data directory path (default: ./{dataset_name}/images)
101
+ - `--output_root`: Custom output root directory (default: ./{dataset_name})
102
+ - `--disable_sort_by_upsample`: Disable sorting by upsampling factor
103
+ - `--disable_check_scene_changes`: Disable scene change detection
104
+ - `--single_category`: Process videos from a single category directory
105
+ - `--use_cache`: Use cache to speed up processing
src/preprocess/filter_dataset_tool.py ADDED
@@ -0,0 +1,315 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ import numpy as np
5
+ from time import time
6
+ import glob
7
+ from tqdm import tqdm
8
+ import scenedetect as sd
9
+ import argparse
10
+
11
+ # Load existing JSON data if available
12
+ def load_json(filename):
13
+ if os.path.exists(filename):
14
+ with open(filename, "r") as f:
15
+ return json.load(f)
16
+ return []
17
+
18
+ def save_json(filename, data):
19
+ with open(filename, "w") as f:
20
+ json.dump(data, f, indent=4)
21
+
22
+ def estimate_upsizing_factor(image_path):
23
+ """Estimate how much an image was upsized before being resized to 720x1280"""
24
+
25
+ # Load image in grayscale
26
+ img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
27
+
28
+ if img is None:
29
+ print(f"Error loading image: {image_path}")
30
+ return None
31
+
32
+ # Compute the 2D Fourier Transform
33
+ f = np.fft.fft2(img)
34
+ fshift = np.fft.fftshift(f) # Center the low frequencies
35
+ magnitude_spectrum = np.abs(fshift)
36
+
37
+ # Compute high-frequency energy
38
+ h, w = img.shape
39
+ cx, cy = w // 2, h // 2 # Center of the image
40
+ radius = min(cx, cy) // 4 # Define a region for high frequencies
41
+
42
+ # Mask low frequencies (keep only high frequencies)
43
+ mask = np.zeros((h, w), np.uint8)
44
+ cv2.circle(mask, (cx, cy), radius, 1, thickness=-1)
45
+ high_freq_energy = np.sum(magnitude_spectrum * (1 - mask))
46
+
47
+ # Normalize energy by image size
48
+ energy_score = high_freq_energy / (h * w)
49
+
50
+ # Estimate how much the image was upscaled
51
+ upsize_factor = 1 / (1 + energy_score) # Inverse relation: lower energy → more upscaling
52
+
53
+ return upsize_factor
54
+
55
+ def check_upsample(video_paths, output_root, use_cache=True):
56
+ t = time()
57
+
58
+ if use_cache:
59
+ cache_file = f"{output_root}/upsample_scores.json"
60
+ cached_data = load_json(cache_file)
61
+
62
+ results = {}
63
+ num_frames = 5
64
+ for src_images in tqdm(video_paths, desc="Computing upscale"):
65
+ vid_name = src_images.split('/')[-1]
66
+ if use_cache and vid_name in cached_data:
67
+ results[src_images] = cached_data[vid_name]
68
+ continue
69
+
70
+ all_images = sorted(glob.glob(f"{src_images}/*.jpg"))
71
+
72
+ if len(all_images) < 5:
73
+ continue
74
+
75
+ frame_indices = np.linspace(0, len(all_images) - 1, num_frames).astype(int)
76
+
77
+ vid_scores = []
78
+ for frame_idx in frame_indices:
79
+ image_path = all_images[frame_idx]
80
+
81
+ upsize_factor = estimate_upsizing_factor(image_path)
82
+ # print(image_dir, upsize_factor)
83
+ vid_scores.append(upsize_factor)
84
+
85
+ results[src_images] = np.median(vid_scores).item()
86
+
87
+ sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
88
+ sorted_results = {k: v for k, v in sorted_results}
89
+
90
+ if use_cache:
91
+ sorted_vids_by_names = {k.split('/')[-1]: v for k, v in sorted_results.items()}
92
+ save_json(cache_file, sorted_vids_by_names)
93
+
94
+ # print(f"Done in {time()-t:.2f}s")
95
+ return sorted_results
96
+
97
+ def detect_scenes(image_folder, threshold=27.0):
98
+ """Detects scene changes in a folder of images using PySceneDetect."""
99
+ image_files = [os.path.join(image_folder, f) for f in sorted(os.listdir(image_folder)) if f.lower().endswith(('.jpg', '.jpeg'))]
100
+ detector = sd.detectors.ContentDetector(threshold=threshold)
101
+ scene_list = []
102
+ prev_frame = None
103
+ frame_num = 0
104
+
105
+ for image_idx in range(0, len(image_files), 2): # Skip frames to go faster
106
+ image_file = image_files[image_idx]
107
+ frame = cv2.imread(image_file)
108
+ if frame is None:
109
+ continue
110
+
111
+ frame_num += 1
112
+ if prev_frame is not None:
113
+ if detector.process_frame(frame_num, frame):
114
+ scene_list.append(frame_num)
115
+
116
+ prev_frame = frame
117
+
118
+ return scene_list
119
+
120
+ def scan_scene_changes(video_paths, output_root, use_cache=True):
121
+
122
+ if use_cache:
123
+ cache_file = f"{output_root}/scene_changes.json"
124
+ cached_data = load_json(cache_file)
125
+
126
+ all_scene_change_vids = []
127
+ scene_changes_by_vid_name = {}
128
+ for folder_path in tqdm(video_paths, desc="Detecting scene changes"):
129
+
130
+ vid_name = folder_path.split('/')[-1]
131
+ if use_cache and vid_name in cached_data:
132
+ scene_changes = cached_data[vid_name]
133
+ else:
134
+ scene_changes = detect_scenes(folder_path)
135
+
136
+ scene_changes_by_vid_name[vid_name] = scene_changes
137
+
138
+ if len(scene_changes) > 0:
139
+ # print(f"{folder_path.split('/')[-1]} scene changes:", scene_changes)
140
+ all_scene_change_vids.append(folder_path)
141
+
142
+ if use_cache:
143
+ save_json(cache_file, scene_changes_by_vid_name)
144
+
145
+ print("Scene change vids:", len(all_scene_change_vids))
146
+ return all_scene_change_vids
147
+
148
+
149
+ def sort_tool(video_folders, rejected_file, highquality_file, start_idx=0):
150
+
151
+ rejected_videos = load_json(rejected_file)
152
+ highquality_videos = load_json(highquality_file)
153
+
154
+ rejected_videos_count = 0
155
+ for video_path in video_folders:
156
+ video_name = video_path.split("/")[-1]
157
+ if video_name in rejected_videos:
158
+ rejected_videos_count += 1
159
+ print(f"{rejected_videos_count}/{len(video_folders)} videos already rejected in this set")
160
+
161
+ video_idx = start_idx
162
+ frame_idx = 0
163
+ fps = 12
164
+ last_action_next = True
165
+
166
+ while True:
167
+ video_path = video_folders[video_idx]
168
+ video_name = video_path.split("/")[-1]
169
+ image_files = sorted([f for f in os.listdir(video_path) if f.endswith(".jpg")])
170
+
171
+ if not image_files:
172
+ print(f"No images found in {video_name}")
173
+ if last_action_next:
174
+ video_idx = (video_idx + 1) % len(video_folders)
175
+ else:
176
+ video_idx = (video_idx - 1) % len(video_folders)
177
+ continue
178
+
179
+ if video_name in rejected_videos or video_name in highquality_videos:
180
+ print(f"{video_name} already filtered")
181
+ if last_action_next:
182
+ video_idx = (video_idx + 1) % len(video_folders)
183
+ else:
184
+ video_idx = (video_idx - 1) % len(video_folders)
185
+ continue
186
+
187
+ frame_idx = 0
188
+ playing = True
189
+ paused = False
190
+
191
+ while playing:
192
+ frame_path = os.path.join(video_path, image_files[frame_idx])
193
+ frame = cv2.imread(frame_path)
194
+
195
+ if frame is None:
196
+ print(f"Failed to load {frame_path}")
197
+ continue
198
+
199
+ display_text = f"Video: {video_name} ({video_idx}/{len(video_folders)}) | Frame: {frame_idx + 1}/{len(image_files)} | fps: {fps}"
200
+ cv2.putText(frame, display_text, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
201
+ cv2.imshow("Video Reviewer", frame)
202
+
203
+ key = cv2.waitKey(int(1000 / fps)) # Delay according to current fps
204
+
205
+ if key == ord('w'): # Next frame
206
+ frame_idx = min(len(image_files)-1, frame_idx + 1)
207
+ paused = True
208
+ elif key == ord('s'): # Previous frame
209
+ frame_idx = max(0, frame_idx - 1)
210
+ paused = True
211
+ elif key == ord('d'): # Next video
212
+ video_idx = (video_idx + 1) % len(video_folders)
213
+ last_action_next = True
214
+ break
215
+ elif key == ord('a'): # Previous video
216
+ video_idx = (video_idx - 1) % len(video_folders)
217
+ last_action_next = False
218
+ break
219
+ elif key == ord('r'): # Reject video
220
+ if video_name not in rejected_videos:
221
+ rejected_videos.append(video_name)
222
+ save_json(rejected_file, rejected_videos)
223
+ print(f"Rejected: {video_name}")
224
+ video_idx = (video_idx + 1) % len(video_folders)
225
+ break
226
+ elif key == ord('h'): # Mark as high quality
227
+ if video_name not in highquality_videos:
228
+ highquality_videos.append(video_name)
229
+ save_json(highquality_file, highquality_videos)
230
+ print(f"High Quality: {video_name}")
231
+ video_idx = (video_idx + 1) % len(video_folders)
232
+ break
233
+ elif key == ord('p'): # Increase fps
234
+ fps += 1
235
+ elif key == ord('l'): # Lower fps
236
+ fps = max(1, fps - 1)
237
+ elif key == 27: # ESC to exit
238
+ playing = False
239
+ break
240
+
241
+ if not paused:
242
+ frame_idx = (frame_idx + 1) % len(image_files)
243
+
244
+ if key == 27:
245
+ print(f"Last video: {video_name} ({video_idx})")
246
+ break
247
+
248
+ cv2.destroyAllWindows()
249
+
250
+ def collect_all_videos(data_dir, single_category=False):
251
+ all_video_paths = []
252
+ if single_category:
253
+ all_video_paths = sorted(glob.glob(f"{data_dir}/*"))
254
+ else:
255
+ for category in sorted(os.listdir(data_dir)):
256
+ category_path = os.path.join(data_dir, category)
257
+ if os.path.isdir(category_path):
258
+ all_video_paths.extend(sorted(glob.glob(f"{category_path}/*")))
259
+ return all_video_paths
260
+
261
+ if __name__ == "__main__":
262
+ parser = argparse.ArgumentParser(description='Filter and sort video dataset')
263
+ parser.add_argument('--dataset_name', type=str, required=True,
264
+ help='Name of the dataset directory')
265
+ parser.add_argument('--start_idx', type=int, default=0,
266
+ help='Starting index for video review')
267
+ parser.add_argument('--data_dir', type=str, default=None,
268
+ help='Custom data directory path (defaults to ./{dataset_name}/images)')
269
+ parser.add_argument('--output_root', type=str, default=None,
270
+ help='Custom output root directory (defaults to ./{dataset_name})')
271
+ parser.add_argument('--disable_sort_by_upsample', action='store_true',
272
+ help='Disable sorting videos by upsampling factor')
273
+ parser.add_argument('--disable_check_scene_changes', action='store_true',
274
+ help='Disable checking for scene changes in videos')
275
+ parser.add_argument('--single_category', action='store_true',
276
+ help='Process videos from a single category directory')
277
+ parser.add_argument('--use_cache', action='store_true',
278
+ help='Use cache to speed up processing')
279
+ args = parser.parse_args()
280
+
281
+ # Set default paths if not specified
282
+ if args.data_dir is None:
283
+ args.data_dir = f"./{args.dataset_name}/images"
284
+ if args.output_root is None:
285
+ args.output_root = f"./{args.dataset_name}"
286
+
287
+ # Output JSON files
288
+ rejected_file = f"{args.output_root}/rejected.json"
289
+ auto_low_quality = f"{args.output_root}/auto_low_quality.json"
290
+ highquality_file = f"{args.output_root}/highquality.json"
291
+
292
+ all_video_paths = collect_all_videos(args.data_dir, args.single_category)
293
+
294
+ if not args.disable_sort_by_upsample:
295
+ sorted_vids = check_upsample(all_video_paths, args.output_root, use_cache=args.use_cache)
296
+ video_folders = list(sorted_vids.keys())
297
+
298
+ # Save the worst to file
299
+ # auto_reject_vids = [v.split('/')[-1] for v in video_folders[:2000]]
300
+ # save_json(auto_low_quality, auto_reject_vids)
301
+ else:
302
+ video_folders = all_video_paths
303
+
304
+ # Prepend scene change samples
305
+ if not args.disable_check_scene_changes:
306
+ new_video_folders = []
307
+ scene_change_vids = scan_scene_changes(all_video_paths, args.output_root, use_cache=args.use_cache)
308
+ new_video_folders.extend(scene_change_vids)
309
+ for vid_name in video_folders:
310
+ if vid_name not in new_video_folders:
311
+ new_video_folders.append(vid_name)
312
+ video_folders = new_video_folders
313
+
314
+ # Start tool
315
+ sort_tool(video_folders, rejected_file, highquality_file, start_idx=args.start_idx)
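The tool's output is a pair of JSON lists (`rejected.json` and `highquality.json`) under `output_root`. A small, hypothetical consumer of those files, assuming the same `images/<category>/<video>` layout scanned by `collect_all_videos`:

```python
import json
import os
from glob import glob

def load_filtered_videos(output_root: str, data_dir: str, single_category: bool = False):
    """Return (kept video folders, high-quality video names) after manual review. Illustrative helper."""
    def _load(path):
        if not os.path.exists(path):
            return []
        with open(path) as f:
            return json.load(f)

    rejected = set(_load(os.path.join(output_root, "rejected.json")))
    high_quality = set(_load(os.path.join(output_root, "highquality.json")))

    pattern = f"{data_dir}/*" if single_category else f"{data_dir}/*/*"
    all_videos = sorted(p for p in glob(pattern) if os.path.isdir(p))
    kept = [v for v in all_videos if os.path.basename(v) not in rejected]
    return kept, high_quality
```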
src/preprocess/preprocess_cap_dataset.py ADDED
@@ -0,0 +1,224 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ from tqdm import tqdm
5
+ from glob import glob
6
+ import argparse
7
+
8
+ from yolo_sam import YoloSamProcessor
9
+
10
+ def load_json(filename):
11
+ if os.path.exists(filename):
12
+ with open(filename, "r") as f:
13
+ return json.load(f)
14
+ print(filename, "not found")
15
+ return []
16
+
17
+ def create_video(sample, video_path):
18
+ video_filename = f"{video_path}.mp4"
19
+ FPS = 12
20
+ frame_size = (1056, 660)  # (512, 320)
21
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
22
+ video_writer_out = cv2.VideoWriter(video_filename, fourcc, FPS, frame_size)
23
+
24
+ for img in sample:
25
+ video_writer_out.write(img)
26
+
27
+ video_writer_out.release()
28
+ print(f"Video saved: {video_filename}")
29
+
30
+
31
+ def crop_images(images_dir_path, output_dir_path, crop_extents=None):
32
+ """
33
+ Crop frames
34
+ """
35
+
36
+ all_images = sorted(glob(f"{images_dir_path}/*.jpg"))
37
+ total_frames = len(all_images)
38
+ sample_image = cv2.imread(all_images[0])
39
+ sample_name = str(int(images_dir_path.split('/')[-2])).zfill(5)
40
+ sample_category = images_dir_path.split('/')[-3]
41
+ out_vid_name = f"{sample_category}_{sample_name}"
42
+ src_height, src_width = sample_image.shape[:2]
43
+
44
+ if crop_extents:
45
+ src_height, src_width = (src_height + crop_extents[1]) - crop_extents[0], (src_width + crop_extents[3]) - crop_extents[2]
46
+
47
+ # print(f"Source images '{out_vid_name}': {src_height}x{src_width}")
48
+
49
+ image_output_folder = os.path.join(output_dir_path, out_vid_name)
50
+ os.makedirs(image_output_folder, exist_ok=True)
51
+
52
+ out_frame_count = 0
53
+ # sample_test = []
54
+ for frame_idx in range(total_frames):
55
+
56
+ frame_path = all_images[frame_idx]
57
+ frame = cv2.imread(frame_path)
58
+
59
+ # Crop frame
60
+ if crop_extents:
61
+ frame = frame[crop_extents[0]:crop_extents[1], crop_extents[2]:crop_extents[3]]
62
+
63
+ # Save frame
64
+ out_image_name = f"{out_vid_name}_{str(frame_idx).zfill(4)}.jpg"
65
+ out_image_path = os.path.join(image_output_folder, out_image_name)
66
+ cv2.imwrite(out_image_path, frame)
67
+
68
+ # sample_test.append(frame)
69
+
70
+ out_frame_count += 1
71
+
72
+ print(f"Done '{out_vid_name}': {src_height}x{src_width}, {out_frame_count} frames")
73
+
74
+ # create_video(sample_test, "path/to/sample_test_vid", fps=6)
75
+
76
+
77
+ def extract_frames(dataset_dir, out_directory, crop_extents=None, specific_videos=None):
78
+
79
+ # NOTE: We are excluding all crashes that involve visible humans (pedestrians, cyclists, motorbikes...)
80
+ video_types_to_exclude = [1, 2, 3, 4, 5, 6, 37, 38, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
81
+
82
+ for category_dir in sorted(os.listdir(dataset_dir)):
83
+ category_dir_path = os.path.join(dataset_dir, category_dir)
84
+ if not os.path.isdir(category_dir_path):
85
+ continue
86
+
87
+ # Let's filter the videos we want right away
88
+ vid_type = int(category_dir)
89
+ if int(vid_type) in video_types_to_exclude:
90
+ continue
91
+
92
+ for vid_name in tqdm(sorted(os.listdir(category_dir_path))):
93
+ if specific_videos is not None and vid_name not in specific_videos:
94
+ continue
95
+
96
+ images_dir_path = os.path.join(category_dir_path, vid_name, "images")
97
+ out_path = os.path.join(out_directory, "images", category_dir)
98
+ crop_images(images_dir_path, out_path, crop_extents=crop_extents)
99
+
100
+ print("Extraction complete.")
101
+
102
+
103
+ def generate_labels(out_directory, vid_names=None, subdir="", in_directory=None, reverse_order=False):
104
+ label_output_folder = os.path.join(out_directory, "labels", subdir)
105
+ os.makedirs(label_output_folder, exist_ok=True)
106
+
107
+ # Checkpoint paths
108
+ yolo_ckpt = "yolov8x.pt" # Will auto-download with ultralytics
109
+ sam2_ckpt = "/network/scratch/x/xuolga/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
110
+ sam2_cfg = "./configs/sam2.1/sam2.1_hiera_b+.yaml"
111
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
112
+
113
+ samples_run = 0
114
+
115
+ src_directory = in_directory if in_directory is not None else out_directory
116
+ video_dir_root = os.path.join(src_directory, "images", subdir)
117
+ for category in sorted(os.listdir(video_dir_root), reverse=reverse_order):
118
+ category_root = os.path.join(video_dir_root, category)
119
+ for video_name in tqdm(sorted(os.listdir(category_root), reverse=reverse_order)):
120
+ if vid_names is not None and video_name not in vid_names:
121
+ continue
122
+
123
+ video_dir = os.path.join(category_root, video_name)
124
+ if len(os.listdir(video_dir)) == 0:
125
+ print("Empty video dir:", video_dir)
126
+ continue
127
+
128
+ # Skip if label file already exists
129
+ out_label_path = os.path.join(label_output_folder, f"{video_name}.json")
130
+ if os.path.exists(out_label_path):
131
+ print(f"Skipping {video_name} - label file already exists")
132
+ continue
133
+
134
+ if len(os.listdir(video_dir)) > 300:
135
+ print(f"SKIPPING LONG VIDEO {video_name}")
136
+ continue
137
+
138
+ print(f"Computing bboxes for {video_name}...")
139
+ video_data = yolo_sam(video_dir, rel_bbox=True)
140
+
141
+ # Add metadata
142
+ vid_type = int(video_name.split('_')[0])
143
+ ego_involved = vid_type < 19 or vid_type == 61
144
+ final_out_data = {
145
+ "video_source": f"{video_name}.mp4",
146
+ "metadata": {
147
+ "ego_involved": ego_involved,
148
+ "accident_type": vid_type
149
+ },
150
+ "data": video_data
151
+ }
152
+
153
+ with open(out_label_path, 'w') as json_file:
154
+ json.dump(final_out_data, json_file, indent=1)
155
+
156
+ print("Saved label:", out_label_path)
157
+
158
+ samples_run += 1
159
+ if samples_run > 50:
160
+ print("Resetting Yolo_Sam in case of memory leak")
161
+ del yolo_sam
162
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
163
+ samples_run = 0
164
+
165
+
166
+ def make_train_val_split(out_directory):
167
+ image_folder = os.path.join(out_directory, "images")
168
+ label_folder = os.path.join(out_directory, "labels")
169
+
170
+ all_image_folders = os.listdir(image_folder)
171
+ split_idx = int(len(all_image_folders) * 0.9)
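+ # NOTE: os.listdir order is arbitrary, so this 90/10 split is neither shuffled nor seeded; sort or shuffle the list first if a reproducible split is needed.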
172
+
173
+ train_split = all_image_folders[:split_idx]
174
+ val_split = all_image_folders[split_idx:]
175
+
176
+ os.makedirs(os.path.join(image_folder, "train"), exist_ok=True)
177
+ os.makedirs(os.path.join(image_folder, "val"), exist_ok=True)
178
+ os.makedirs(os.path.join(label_folder, "train"), exist_ok=True)
179
+ os.makedirs(os.path.join(label_folder, "val"), exist_ok=True)
180
+
181
+ for filename in train_split:
182
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "train", filename))
183
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "train", f"{filename}.json"))
184
+
185
+ for filename in val_split:
186
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "val", filename))
187
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "val", f"{filename}.json"))
188
+
189
+
190
+ if __name__ == "__main__":
191
+ parser = argparse.ArgumentParser(description='Process CAP dataset')
192
+ parser.add_argument('--dataset_root', type=str, required=True,
193
+ help='Root directory for datasets')
194
+ parser.add_argument('--dataset_dir', type=str, default="/network/scratch/l/luis.lara/dev/MM-AU/CAP-DATA",
195
+ help='Directory containing the CAP dataset')
196
+ parser.add_argument('--out_directory', type=str, default=None,
197
+ help='Output directory (defaults to {dataset_root}/cap_images_12fps)')
198
+ parser.add_argument('--reverse', action='store_true',
199
+ help='Process samples in reverse order')
200
+ parser.add_argument('--skip_extraction', action='store_true',
201
+ help='Skip frame extraction step')
202
+ parser.add_argument('--skip_labels', action='store_true',
203
+ help='Skip label generation step')
204
+ parser.add_argument('--skip_split', action='store_true',
205
+ help='Skip train/val split step')
206
+ args = parser.parse_args()
207
+
208
+ # Set default output directory if not specified
209
+ if args.out_directory is None:
210
+ args.out_directory = os.path.join(args.dataset_root, "cap_images_12fps")
211
+
212
+ # Extract frames from videos
213
+ if not args.skip_extraction:
214
+ cap_crop_extents = [40, -40, 128, -128] # Custom crop for CAP dataset (to fix the aspect ratio)
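+ # crop_extents are [top, bottom, left, right] slice bounds; negative values count from the opposite edge (plain Python slicing).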
215
+ extract_frames(args.dataset_dir, args.out_directory, crop_extents=cap_crop_extents, specific_videos=None)
216
+
217
+ # Create labels (run bbox detector)
218
+ if not args.skip_labels:
219
+ in_directory = os.path.join(args.dataset_root, "cap_images_12fps")
220
+ generate_labels(args.out_directory, vid_names=None, in_directory=in_directory, reverse_order=args.reverse)
221
+
222
+ # Split into train and val sets
223
+ if not args.skip_split:
224
+ make_train_val_split(args.out_directory)
src/preprocess/preprocess_dada_dataset.py ADDED
@@ -0,0 +1,222 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ from tqdm import tqdm
5
+ import argparse
6
+
7
+ from yolo_sam import YoloSamProcessor
8
+
9
+
10
+ def create_video(sample, video_path):
11
+ video_filename = f"{video_path}.mp4"
12
+ FPS = 12
13
+ frame_size = (1056, 660)  # (512, 320)
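+ # NOTE: cv2.VideoWriter expects every frame to match frame_size exactly; mismatched frames can yield an empty/unplayable file.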
14
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
15
+ video_writer_out = cv2.VideoWriter(video_filename, fourcc, FPS, frame_size)
16
+
17
+ for img in sample:
18
+ video_writer_out.write(img)
19
+
20
+ video_writer_out.release()
21
+ print(f"Video saved: {video_filename}")
22
+
23
+
24
+ def downsample_and_crop_vid(video_path, output_dir, out_fps=12, crop_extents=None):
25
+ """
26
+ Downsample fps and crop frames
27
+ """
28
+
29
+ # Load video
30
+ cap = cv2.VideoCapture(video_path)
31
+ org_fps = int(cap.get(cv2.CAP_PROP_FPS))
32
+ src_width, src_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
33
+ original_sample_name = video_path.split('/')[-1].split('.')[0]
34
+ category = original_sample_name.split("_")[0]
35
+ vid_num = '90'+str(int(original_sample_name.split("_")[-1])).zfill(3) # NOTE: We prepend a '90' to differentiate between DADA and CAP samples
36
+ sample_name = f"{category}_{vid_num}"
37
+ if crop_extents:
38
+ src_width, src_height = (src_width + crop_extents[3]) - crop_extents[2], (src_height + crop_extents[1]) - crop_extents[0]
39
+
40
+ print(f"Source video '{sample_name}': {src_width}x{src_height}, fps={org_fps}")
41
+
42
+ total_frames = 0
43
+ target_period = 1/out_fps
44
+ last_frame_time = target_period
45
+ out_frame_count = 0
46
+
47
+ image_output_folder = os.path.join(output_dir, "images", category, sample_name)
48
+ os.makedirs(image_output_folder, exist_ok=True)
49
+
50
+ # sample_test = []
51
+ while cap.isOpened():
52
+ success, frame = cap.read()
53
+
54
+ if not success:
55
+ break
56
+
57
+ # Extract frames according to desired fps
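+ # Accumulator-based resampling: track the elapsed time since the last kept frame and emit one whenever it reaches 1/out_fps, carrying the remainder over so the average output rate matches out_fps.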
58
+ if last_frame_time >= target_period:
59
+ out_frame_count += 1
60
+ last_frame_time = (last_frame_time - target_period)
61
+
62
+ # Crop frame
63
+ if crop_extents:
64
+ frame = frame[:, crop_extents[2] : crop_extents[3]]
65
+
66
+ # Save frame
67
+ out_image_name = f"{sample_name}_{str(total_frames).zfill(4)}.jpg"
68
+ out_image_path = os.path.join(image_output_folder, out_image_name)
69
+ cv2.imwrite(out_image_path, frame)
70
+
71
+ # sample_test.append(frame)
72
+
73
+ total_frames += 1
74
+ last_frame_time += 1/org_fps
75
+
76
+ print(f"Done '{sample_name}': {out_frame_count} frames, fps: {out_frame_count / (total_frames*1/org_fps)}")
77
+ cap.release()
78
+ # create_video(sample_test, "/path/to/sample_test_vid")
79
+
80
+
81
+ def extract_frames(dataset_dir, out_directory, crop_extents=None, out_fps=12):
82
+ dataset_video_dir = os.path.join(dataset_dir)
83
+
84
+ # NOTE: We are excluding all crashes that involve visible humans (pedestrians, cyclists, motorbikes...)
85
+ video_types_to_exclude = [1, 2, 3, 4, 5, 6, 37, 38, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]
86
+
87
+ for filename in tqdm(os.listdir(dataset_video_dir)):
88
+ if filename.split('.')[-1] != "mp4":
89
+ continue
90
+
91
+ # Let's filter the videos we want right away
92
+ vid_type = filename.split('_')[0]
93
+ if int(vid_type) in video_types_to_exclude:
94
+ continue
95
+
96
+ video_path = os.path.join(dataset_video_dir, filename)
97
+ downsample_and_crop_vid(video_path, out_directory, out_fps=out_fps, crop_extents=crop_extents)
98
+ print("Extraction complete.")
99
+
100
+
101
+ def generate_labels(out_directory, vid_names=None, subdir=""):
102
+ label_output_folder = os.path.join(out_directory, "labels", subdir)
103
+ os.makedirs(label_output_folder, exist_ok=True)
104
+
105
+ # Checkpoint paths
106
+ yolo_ckpt = "yolov8x.pt" # Will auto download with utltralytics
107
+ sam2_ckpt = "/network/scratch/x/xuolga/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
108
+ sam2_cfg = "./configs/sam2.1/sam2.1_hiera_b+.yaml"
109
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
110
+
111
+ samples_run = 0
112
+
113
+ video_dir_root = os.path.join(out_directory, "images", subdir)
114
+ for category in sorted(os.listdir(video_dir_root), reverse=True):
115
+ category_root = os.path.join(video_dir_root, category)
116
+ for video_name in tqdm(os.listdir(category_root)):
117
+ if vid_names is not None and video_name not in vid_names:
118
+ continue
119
+
120
+ video_dir = os.path.join(category_root, video_name)
121
+ if len(os.listdir(video_dir)) == 0:
122
+ print("Empty video dir:", video_dir)
123
+ continue
124
+
125
+ if len(os.listdir(video_dir)) > 300:
126
+ print(f"SKIPPING LONG VIDEO {video_name}")
127
+ continue
128
+
129
+ # Skip if label file already exists
130
+ out_label_path = os.path.join(label_output_folder, f"{video_name}.json")
131
+ if os.path.exists(out_label_path):
132
+ print(f"Skipping {video_name} - label file already exists")
133
+ continue
134
+
135
+ video_data = yolo_sam(video_dir, rel_bbox=True)
136
+
137
+ if video_data is None:
138
+ print("COMPUTED VIDEO DATA IS NULL for video:", video_dir)
139
+
140
+ # Add metadata
141
+ vid_type = int(video_name.split('_')[0])
142
+ ego_involved = vid_type < 19 or vid_type == 61
143
+ final_out_data = {
144
+ "video_source": f"{video_name}.mp4",
145
+ "metadata": {
146
+ "ego_involved": ego_involved,
147
+ "accident_type": vid_type
148
+ },
149
+ "data": video_data
150
+ }
151
+
152
+ with open(out_label_path, 'w') as json_file:
153
+ json.dump(final_out_data, json_file, indent=1)
154
+
155
+ print("Saved label:", out_label_path)
156
+
157
+ samples_run += 1
158
+ if samples_run > 50:
159
+ print("Resetting Yolo_Sam in case of memory leak")
160
+ del yolo_sam
161
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
162
+ samples_run = 0
163
+
164
+
165
+ def make_train_val_split(out_directory):
166
+ image_folder = os.path.join(out_directory, "images")
167
+ label_folder = os.path.join(out_directory, "labels")
168
+
169
+ all_image_folders = os.listdir(image_folder)
170
+ split_idx = int(len(all_image_folders) * 0.9)
171
+
172
+ train_split = all_image_folders[:split_idx]
173
+ val_split = all_image_folders[split_idx:]
174
+
175
+ os.makedirs(os.path.join(image_folder, "train"), exist_ok=True)
176
+ os.makedirs(os.path.join(image_folder, "val"), exist_ok=True)
177
+ os.makedirs(os.path.join(label_folder, "train"), exist_ok=True)
178
+ os.makedirs(os.path.join(label_folder, "val"), exist_ok=True)
179
+
180
+ for filename in train_split:
181
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "train", filename))
182
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "train", f"{filename}.json"))
183
+
184
+ for filename in val_split:
185
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "val", filename))
186
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "val", f"{filename}.json"))
187
+
188
+
189
+ if __name__ == "__main__":
190
+ parser = argparse.ArgumentParser(description='Process DADA2000 dataset')
191
+ parser.add_argument('--dataset_root', type=str, required=True,
192
+ help='Root directory for datasets')
193
+ parser.add_argument('--dataset_dir', type=str, required=True,
194
+ help='Directory containing the DADA2000 dataset')
195
+ parser.add_argument('--out_directory', type=str, default=None,
196
+ help='Output directory (defaults to {dataset_root}/dada2000_images_12fps)')
197
+ parser.add_argument('--skip_extraction', action='store_true',
198
+ help='Skip frame extraction step')
199
+ parser.add_argument('--skip_labels', action='store_true',
200
+ help='Skip label generation step')
201
+ parser.add_argument('--skip_split', action='store_true',
202
+ help='Skip train/val split step')
203
+ parser.add_argument('--out_fps', type=int, default=12,
204
+ help='Output frames per second (default: 12)')
205
+ args = parser.parse_args()
206
+
207
+ # Set default output directory if not specified
208
+ if args.out_directory is None:
209
+ args.out_directory = os.path.join(args.dataset_root, "dada2000_images_12fps")
210
+
211
+ # Extract frames from videos
212
+ if not args.skip_extraction:
213
+ dada_crop_extents = [0, -0, 264, -264] # Custom crop for DADA2000 dataset (to fix the aspect ratio)
214
+ extract_frames(args.dataset_dir, args.out_directory, crop_extents=dada_crop_extents, out_fps=args.out_fps)
215
+
216
+ # Create labels (run bbox detector)
217
+ if not args.skip_labels:
218
+ generate_labels(args.out_directory, vid_names=None)
219
+
220
+ # Split into train and val sets
221
+ if not args.skip_split:
222
+ make_train_val_split(args.out_directory)
src/preprocess/preprocess_russia_dataset.py ADDED
@@ -0,0 +1,168 @@
1
+ import os
2
+ import cv2
3
+ import json
4
+ from tqdm import tqdm
5
+ import argparse
6
+
7
+ from yolo_sam import YoloSamProcessor
8
+
9
+ def downsample_and_crop_vid(video_path, output_dir, out_fps=7, crop_extents=None):
10
+ """
11
+ Downsample fps and crop frames
12
+ """
13
+
14
+ # Load video
15
+ cap = cv2.VideoCapture(video_path)
16
+ org_fps = int(cap.get(cv2.CAP_PROP_FPS))
17
+ src_width, src_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
18
+ sample_name = video_path.split('/')[-1].split('.')[0]
19
+ if crop_extents:
20
+ src_width, src_height = (src_width + crop_extents[3]) - crop_extents[2], (src_height + crop_extents[1]) - crop_extents[0]
21
+
22
+ print(f"Source video '{sample_name}': {src_width}x{src_height}, fps={org_fps}")
23
+
24
+ total_frames = 0
25
+ target_period = 1/out_fps
26
+ last_frame_time = target_period
27
+ out_frame_count = 0
28
+
29
+ image_output_folder = os.path.join(output_dir, "images", sample_name)
30
+ os.makedirs(image_output_folder, exist_ok=True)
31
+
32
+ while cap.isOpened():
33
+ success, frame = cap.read()
34
+
35
+ if not success:
36
+ break
37
+
38
+ # Extract frames according to desired fps
39
+ if last_frame_time >= target_period:
40
+ out_frame_count += 1
41
+ last_frame_time = (last_frame_time - target_period)
42
+
43
+ # Crop frame
44
+ if crop_extents:
45
+ frame = frame[crop_extents[0] : crop_extents[1], crop_extents[2] : crop_extents[3]]
46
+
47
+ # Save frame
48
+ out_image_name = f"{sample_name}_{str(total_frames).zfill(4)}.jpg"
49
+ out_image_path = os.path.join(image_output_folder, out_image_name)
50
+ cv2.imwrite(out_image_path, frame)
51
+
52
+ total_frames += 1
53
+ last_frame_time += 1/org_fps
54
+
55
+ print(f"Done '{sample_name}': {out_frame_count} frames, fps: {out_frame_count / (total_frames*1/org_fps)}")
56
+ cap.release()
57
+
58
+ def extract_frames(dataset_dir, out_directory, crop_extents=None):
59
+ dataset_video_dir = os.path.join(dataset_dir, "video")
60
+ for filename in tqdm(os.listdir(dataset_video_dir)):
61
+ video_path = os.path.join(dataset_video_dir, filename)
62
+ fps = 7
63
+ downsample_and_crop_vid(video_path, out_directory, out_fps=fps, crop_extents=crop_extents)
64
+ print("Extraction complete.")
65
+
66
+
67
+ def generate_labels(dataset_dir, out_directory, video_subdir=''):
68
+ label_output_folder = os.path.join(out_directory, "labels", video_subdir)
69
+ os.makedirs(label_output_folder, exist_ok=True)
70
+
71
+ # Checkpoint paths
72
+ yolo_ckpt = "yolov8x.pt" # Will auto download with utltralytics
73
+ sam2_ckpt = "/network/scratch/x/xuolga/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
74
+ sam2_cfg = "./configs/sam2.1/sam2.1_hiera_b+.yaml"
75
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
76
+
77
+ video_dir_root = os.path.join(out_directory, "images", video_subdir)
78
+ # for video_name in tqdm(os.listdir(video_dir_root)):
79
+ for video_name in tqdm(["w10_138", "w10_94", "w1_10", "w1_46", "w2_79", "w3_17", "w6_14", "w6_44", "w6_78", "w6_94", "w7_1", "w7_14"]):
80
+ video_dir = os.path.join(video_dir_root, video_name)
81
+ if len(os.listdir(video_dir)) == 0:
82
+ print("Empty video dir:", video_dir)
83
+ continue
84
+
85
+ video_data = yolo_sam(video_dir, rel_bbox=True)
86
+
87
+ # Add metadata
88
+ org_dataset_labels = os.path.join(dataset_dir, "label", "json")
89
+ orig_label_path = os.path.join(org_dataset_labels, f"{video_name}.json")
90
+ with open(orig_label_path, 'r') as json_file:
91
+ metadata = json.load(json_file)[0]['meta_data']
92
+
93
+ final_out_data = {
94
+ "video_source": f"{video_name}.mp4",
95
+ "metadata": metadata,
96
+ "data": video_data
97
+ }
98
+
99
+ out_label_path = os.path.join(label_output_folder, f"{video_name}.json")
100
+ with open(out_label_path, 'w') as json_file:
101
+ json.dump(final_out_data, json_file, indent=1)
102
+
103
+ print("Saved label:", out_label_path)
104
+
105
+
106
+ def make_train_val_split(out_directory):
107
+ image_folder = os.path.join(out_directory, "images")
108
+ label_folder = os.path.join(out_directory, "labels")
109
+
110
+ all_image_folders = os.listdir(image_folder)
111
+ split_idx = int(len(all_image_folders) * 0.9)
112
+
113
+ train_split = all_image_folders[:split_idx]
114
+ val_split = all_image_folders[split_idx:]
115
+
116
+ os.makedirs(os.path.join(image_folder, "train"), exist_ok=True)
117
+ os.makedirs(os.path.join(image_folder, "val"), exist_ok=True)
118
+ os.makedirs(os.path.join(label_folder, "train"), exist_ok=True)
119
+ os.makedirs(os.path.join(label_folder, "val"), exist_ok=True)
120
+
121
+ for filename in train_split:
122
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "train", filename))
123
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "train", f"{filename}.json"))
124
+
125
+ for filename in val_split:
126
+ os.rename(os.path.join(image_folder, filename), os.path.join(image_folder, "val", filename))
127
+ os.rename(os.path.join(label_folder, f"{filename}.json"), os.path.join(label_folder, "val", f"{filename}.json"))
128
+
129
+
130
+ if __name__ == "__main__":
131
+ parser = argparse.ArgumentParser(description='Process Russia Car Crash dataset')
132
+ parser.add_argument('--dataset_root', type=str, required=True,
133
+ help='Root directory for datasets')
134
+ parser.add_argument('--dataset_dir', type=str, required=True,
135
+ help='Directory containing the Russia Car Crash dataset')
136
+ parser.add_argument('--out_directory', type=str, default=None,
137
+ help='Output directory (defaults to {dataset_root}/preprocess_russia_crash)')
138
+ parser.add_argument('--skip_extraction', action='store_true',
139
+ help='Skip frame extraction step')
140
+ parser.add_argument('--skip_labels', action='store_true',
141
+ help='Skip label generation step')
142
+ parser.add_argument('--skip_split', action='store_true',
143
+ help='Skip train/val split step')
144
+ parser.add_argument('--process_train', action='store_true',
145
+ help='Process training set (default is validation set only)')
146
+ args = parser.parse_args()
147
+
148
+ # Set default output directory if not specified
149
+ if args.out_directory is None:
150
+ args.out_directory = os.path.join(args.dataset_root, "preprocess_russia_crash")
151
+
152
+ # Custom crop for Russia dataset (hide largest watermarks)
153
+ src_height, src_width = 986, 555
154
+ russia_crop_extents = [int(0.032*src_height), -int(0.198*src_height), int(0.115*src_width), -int(0.115*src_width)]
155
+
156
+ # Extract frames from videos
157
+ if not args.skip_extraction:
158
+ extract_frames(args.dataset_dir, args.out_directory, crop_extents=russia_crop_extents)
159
+
160
+ # Create labels (run bbox detector)
161
+ if not args.skip_labels:
162
+ generate_labels(args.dataset_dir, args.out_directory, video_subdir='val')
163
+ if args.process_train:
164
+ generate_labels(args.dataset_dir, args.out_directory, video_subdir='train')
165
+
166
+ # Split into train and val sets
167
+ if not args.skip_split:
168
+ make_train_val_split(args.out_directory)
src/preprocess/yolo_sam.py ADDED
@@ -0,0 +1,584 @@
1
+ import os
2
+ import numpy as np
3
+ import matplotlib.pyplot as plt
4
+ from PIL import Image
5
+ import torch
6
+ import random
7
+ from itertools import combinations
8
+ from tqdm import tqdm
9
+ import json
10
+ import cv2
11
+
12
+ from ultralytics import YOLO
13
+ from sam2.build_sam import build_sam2_video_predictor
14
+
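+ # NUM_LOOK_BACK_FRAMES: SAM masks are kept starting this many frames before an object's first YOLO detection; earlier masks are discarded.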
15
+ NUM_LOOK_BACK_FRAMES = 3
16
+ FPS = 12
17
+
18
+ CLASSES_TO_KEEP = { # Using YOLO ids
19
+ 0: 'person',
20
+ 1: 'bicycle',
21
+ 2: 'car',
22
+ 3: 'motorcycle',
23
+ 5: 'bus',
24
+ 6: 'train',
25
+ 7: 'truck',
26
+ }
27
+
28
+
29
+ def create_video_from_images(images_dir, output_video, out_fps, start_frame=None, end_frame=None):
30
+
31
+ images = sorted(os.listdir(images_dir))
32
+
33
+ img0_path = os.path.join(images_dir, images[0])
34
+ img0 = cv2.imread(img0_path)
35
+ height, width, _ = img0.shape
36
+
37
+ fourcc = cv2.VideoWriter_fourcc(*'mp4v')
38
+ out = cv2.VideoWriter(output_video, fourcc, out_fps, (width, height))
39
+
40
+ for idx, frame_name in enumerate(images):
41
+
42
+ if start_frame is not None and idx < start_frame:
43
+ continue
44
+ if end_frame is not None and idx >= end_frame:
45
+ continue
46
+
47
+ img = cv2.imread(os.path.join(images_dir, frame_name))
48
+ out.write(img)
49
+
50
+ out.release()
51
+ print("Saved video:", output_video)
52
+
53
+
54
+ def show_mask(mask, ax, obj_id=None, random_color=False, label=None):
55
+ if random_color:
56
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
57
+ else:
58
+ cmap = plt.get_cmap("tab10")
59
+ cmap_idx = 0 if obj_id is None else obj_id
60
+ color = np.array([*cmap(cmap_idx)[:3], 0.6])
61
+ h, w = mask.shape[-2:]
62
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
63
+ if label is not None:
64
+ text_location = mask.nonzero()
65
+ if len(text_location[0]) > 0:
66
+ rand_point = random.randint(0, len(text_location[0]) - 1)
67
+ ax.text(text_location[2][rand_point], text_location[1][rand_point], label, color=(1, 1, 1))
68
+ ax.imshow(mask_image)
69
+
70
+ def show_box(box, ax, label=None, color=((1, 0.7, 0.7))):
71
+ x0, y0 = box[0], box[1]
72
+ w, h = box[2] - box[0], box[3] - box[1]
73
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0, 0, 0, 0), lw=1))
74
+
75
+ if label is not None:
76
+ ax.text(x0 + w // 2, y0 + h // 2, label, color=color)
77
+
78
+
79
+ class TrackedObject:
80
+ def __init__(self, track_id, class_id, bbox, initial_frame_idx):
81
+ self.track_id = track_id
82
+ self.bbox = bbox
83
+ self.class_pred_counts = {class_id: 1}
84
+ self.initial_frame_idx = initial_frame_idx
85
+
86
+ self._top_class = class_id
87
+
88
+ @property
89
+ def class_id(self):
90
+ """
91
+ The class for the object is whichever class was predicted the most for it
92
+ """
93
+ if self._top_class is not None:
94
+ return self._top_class
95
+
96
+ top_class = None
97
+ top_count = 0
98
+ for class_id, count in self.class_pred_counts.items():
99
+ if count >= top_count:
100
+ top_count = count
101
+ top_class = class_id
102
+
103
+ self._top_class = top_class
104
+ return top_class
105
+
106
+ def new_pred(self, class_id):
107
+ if class_id not in self.class_pred_counts:
108
+ self.class_pred_counts[class_id] = 0
109
+ self.class_pred_counts[class_id] += 1
110
+ self._top_class = None # Remove cached top_class
111
+
112
+ def __repr__(self):
113
+ return f"id:{self.track_id}, class:{self.class_id}, bbox:{self.bbox}, init_frame:{self.initial_frame_idx}"
114
+
115
+ def __str__(self):
116
+ return f"id:{self.track_id}, class:{self.class_id}, bbox:{self.bbox}, init_frame:{self.initial_frame_idx}"
117
+
118
+
119
+ class YoloSamProcessor():
120
+
121
+ def __init__(self, yolo_ckpt, sam_ckpt, sam_cfg, gpu_id=0):
122
+ # Load models
123
+
124
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
125
+ self.device = f"cuda:{gpu_id}"
126
+
127
+ self.yolo_model = YOLO(yolo_ckpt)
128
+ self.sam_model = build_sam2_video_predictor(sam_cfg, sam_ckpt, device=self.device)
129
+
130
+ def __call__(self, video_dir, rel_bbox=True):
131
+ # Renaming the videos to be compatible with SAM2
132
+ prev_frame_paths = {}
133
+ src_width, src_height = None, None
134
+ self.num_frames = len(os.listdir(video_dir))
135
+ for vid in os.listdir(video_dir):
136
+ new_name = vid.split("_")[-1]
137
+ og_path = os.path.join(video_dir, vid)
138
+ new_path = os.path.join(video_dir, new_name)
139
+ os.rename(og_path, new_path)
140
+ prev_frame_paths[new_path] = og_path
141
+
142
+ if src_width is None:
143
+ img_sample = Image.open(new_path)
144
+ src_width, src_height = img_sample.size
145
+
146
+ self.out_data = None
147
+ try:
148
+ # YOLO model
149
+ self.run_yolo(video_dir)
150
+ self.filter_yolo_preds()
151
+
152
+ # SAM2 model
153
+ self.run_sam(video_dir)
154
+ self.filter_sam_preds()
155
+
156
+ self.extract_final_bboxes()
157
+
158
+ # Format the data
159
+ self.out_data = []
160
+ for frame_idx, frame_data in enumerate(self.final_bboxes):
161
+ frame_path = self.frame_path_by_idx[frame_idx]
162
+ frame_name = prev_frame_paths[frame_path].split('/')[-1]
163
+ self.out_data.append({
164
+ "image_source": frame_name,
165
+ "labels": []
166
+ })
167
+ for obj_id, bbox in frame_data.items():
168
+
169
+ bbox = bbox.tolist()
170
+ if rel_bbox:
171
+ # Save bbox coordinates as a ratio to image size
172
+ formatted_bbox = [bbox[0]/src_width, bbox[1]/src_height, bbox[2]/src_width, bbox[3]/src_height]
173
+ else:
174
+ # Keep in absolute coordinates
175
+ formatted_bbox = bbox
176
+
177
+ tracked_obj = self.initial_bboxes[obj_id]
178
+
179
+ out_label = {
180
+ "track_id": obj_id,
181
+ "name": CLASSES_TO_KEEP[tracked_obj.class_id],
182
+ "class": tracked_obj.class_id,
183
+ "box": formatted_bbox,
184
+ }
185
+ self.out_data[frame_idx]["labels"].append(out_label)
186
+ except Exception as e:
187
+ print("Yolo_Sam processor failed:", e)
188
+ finally:
189
+ # Revert back the names of the files
190
+ print("Restoring names in video dir after exception")
191
+ for new_path, old_path in prev_frame_paths.items():
192
+ os.rename(new_path, old_path)
193
+
194
+ return self.out_data
195
+
196
+ def run_yolo(self, video_dir):
197
+ self.initial_bboxes = {} # Store the first bbox for each track id
198
+ self.id_reassigns = {}
199
+ all_preds_by_track_id = []
200
+
201
+ self.frame_path_by_idx = {}
202
+
203
+ # Reset yolo's tracker for new video
204
+ if self.yolo_model.predictor is not None:
205
+ self.yolo_model.predictor.trackers[0].reset()
206
+
207
+ new_id_counter = 1
208
+ sorted_frames = sorted(os.listdir(video_dir))
209
+ for frame_idx, frame_file in enumerate(sorted_frames):
210
+ frame_path = os.path.join(video_dir, frame_file)
211
+ self.frame_path_by_idx[frame_idx] = frame_path
212
+ img = Image.open(frame_path)
213
+
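+ # persist=True keeps the Ultralytics tracker state across calls, so track ids stay consistent over the frames of one video.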
214
+ yolo_results = self.yolo_model.track(img, persist=True, conf=0.1, verbose=False, device=self.device)
215
+ yolo_boxes = yolo_results[0].boxes
216
+ all_preds_by_track_id.append({})
217
+
218
+ # If the detection has a new track id, then we record it
219
+ if yolo_boxes.is_track:
220
+ for idx in range(len(yolo_boxes)):
221
+ track_id = int(yolo_boxes.id[idx].item())
222
+ bbox = yolo_boxes.xyxy[idx].numpy()
223
+ class_id = int(yolo_boxes.cls[idx].item())
224
+
225
+ tracked_obj = self.initial_bboxes.get(track_id)
226
+ # Check if YOLO is trying to assign an id that was already predicted but was not present in the previous frame
227
+ # This means YOLO lost this obj and is attempting to reassign. We will assign a new id for this as we want to
228
+ # trust SAM with the tracking, not YOLO.
229
+ prev_track_id = track_id
230
+ if tracked_obj is not None and frame_idx > 0 and track_id not in all_preds_by_track_id[frame_idx - 1]:
231
+
232
+ # Check if id has been reassigned
233
+ if self.id_reassigns.get(track_id) is not None:
234
+ track_id = self.id_reassigns.get(track_id)
235
+
236
+ if track_id not in all_preds_by_track_id[frame_idx - 1]: # Check again because id might have changed
237
+ # Assign a new track id
238
+ track_id = 100 + new_id_counter
239
+ new_id_counter += 1
240
+ self.id_reassigns[prev_track_id] = track_id
241
+ tracked_obj = None
242
+ # print(f"Frame: {frame_idx} re-assigned id {prev_track_id}->{track_id}")
243
+
244
+ if tracked_obj is None:
245
+
246
+ # Check overlap with existing bboxes to make sure this isn't a double detection
247
+ reject_detection = False
248
+ for idx2 in range(len(yolo_boxes)):
249
+ track_id2 = int(yolo_boxes.id[idx2].item())
250
+ if track_id2 in [track_id, prev_track_id] or self.initial_bboxes.get(track_id2) is None:
251
+ continue
252
+ bbox2 = yolo_boxes.xyxy[idx2].numpy()
253
+ iou = self._bbox_iou(bbox, bbox2)
254
+ if iou >= 0.8:
255
+ reject_detection = True
256
+ # print("Redetection! Frame:", frame_idx, "Iou:", iou, track_id, track_id2)
257
+ break
258
+
259
+ if not reject_detection:
260
+ tracked_obj = TrackedObject(track_id, class_id, bbox, frame_idx)
261
+ self.initial_bboxes[track_id] = tracked_obj
262
+ else:
263
+ tracked_obj.new_pred(class_id)
264
+
265
+ if tracked_obj is not None:
266
+ all_preds_by_track_id[frame_idx][track_id] = TrackedObject(track_id, class_id, bbox, tracked_obj.initial_frame_idx)
267
+
268
+
269
+ def filter_yolo_preds(self):
270
+ # Smooth classes detected + filter out unwanted classes
271
+ self.filtered_objects = []
272
+ self.filtered_objects_by_frame = {}
273
+ for _, tracked_obj in self.initial_bboxes.items():
274
+ if tracked_obj.class_id in CLASSES_TO_KEEP:
275
+ self.filtered_objects.append(tracked_obj)
276
+
277
+ if self.filtered_objects_by_frame.get(tracked_obj.initial_frame_idx) is None:
278
+ self.filtered_objects_by_frame[tracked_obj.initial_frame_idx] = []
279
+ self.filtered_objects_by_frame[tracked_obj.initial_frame_idx].append(tracked_obj)
280
+
281
+ self.initial_frame_idx_by_track_id = {obj.track_id: obj.initial_frame_idx for obj in self.filtered_objects}
282
+
283
+ # print("Filtered objects:", self.filtered_objects)
284
+
285
+
286
+ def run_sam(self, video_dir):
287
+ self.video_segments = {} # video_segments contains the per-frame segmentation results
288
+ self.track_ids_to_reject = {} # {track_id: reject_all_before_frame_idx}
289
+
290
+ if self.filtered_objects is None or len(self.filtered_objects) == 0:
291
+ # There are no objects to track
292
+ return
293
+
294
+ inference_state = self.sam_model.init_state(video_path=video_dir) # NOTE: Kind of annoying that the model requires frames to be named with numbers only...
295
+
296
+ self.sam_model.reset_state(inference_state)
297
+ for obj in self.filtered_objects:
298
+ _, out_obj_ids, out_mask_logits = self.sam_model.add_new_points_or_box(
299
+ inference_state=inference_state,
300
+ frame_idx=obj.initial_frame_idx,
301
+ obj_id=obj.track_id,
302
+ box=obj.bbox,
303
+ )
304
+
305
+ def get_last_frame_occurrence(sam_track_ids_per_frame, track_id, current_idx):
306
+ for frame_idx in range(current_idx-1, -1, -1):
307
+ if track_id in sam_track_ids_per_frame.get(frame_idx, []):
308
+ return frame_idx
309
+ return -1
310
+
311
+ # run propagation throughout the video and collect the results in a dict
312
+ sam_track_ids_per_frame = {}
313
+ long_non_existence_track_ids = []
314
+ with torch.cuda.amp.autocast(): # Need this for some reason to fix some casting issues... (BFloat16 and Float16 mismatches)
315
+ for out_frame_idx, out_obj_ids, out_mask_logits in self.sam_model.propagate_in_video(inference_state):
316
+
317
+ sam_tracked_ids = []
318
+ for pred_idx, mask_logits in enumerate(out_mask_logits):
319
+ mask = (mask_logits > 0.0).cpu().numpy()
320
+ track_id = out_obj_ids[pred_idx]
321
+ if mask.sum() > 0 and out_frame_idx > (self.initial_frame_idx_by_track_id[track_id] - NUM_LOOK_BACK_FRAMES):
322
+ sam_tracked_ids.append(track_id)
323
+ sam_track_ids_per_frame[out_frame_idx] = sam_tracked_ids
324
+
325
+ # Compare *new* YOLO preds and make sure they don't overlap with existing SAM preds
326
+ for new_yolo_obj in self.filtered_objects_by_frame.get(out_frame_idx, []):
327
+ yolo_track_id = new_yolo_obj.track_id
328
+ yolo_bbox = new_yolo_obj.bbox
329
+
330
+ for ind, sam_track_id in enumerate(out_obj_ids):
331
+ if (sam_track_id == yolo_track_id) or (out_frame_idx < (self.initial_frame_idx_by_track_id[sam_track_id] - NUM_LOOK_BACK_FRAMES)):
332
+ continue
333
+
334
+ sam_mask = (out_mask_logits[ind] > 0.0).cpu().numpy()
335
+ sam_bbox = self._get_bbox_from_mask(sam_mask)
336
+ if sam_bbox is None:
337
+ continue
338
+
339
+ # Flag the SAM prediction only if this prediction has been lost for many frames and SAM is trying to recover it
340
+ # (in which case we should keep the YOLO pred)
341
+
342
+ last_occurrence_idx = get_last_frame_occurrence(sam_track_ids_per_frame, sam_track_id, out_frame_idx)
343
+ if (out_frame_idx - last_occurrence_idx) >= (FPS * 0.8) and last_occurrence_idx >= 0:
344
+ print(sam_track_id, "long non-existence:", out_frame_idx - last_occurrence_idx, "frames")
345
+ long_non_existence_track_ids.append(sam_track_id)
346
+
347
+ iou = self._bbox_iou(yolo_bbox, sam_bbox)
348
+ if iou > 0.8:
349
+ # Reject the SAM tracked object if it was lost for many frames
350
+ if sam_track_id in long_non_existence_track_ids:
351
+ rejected_track_id = sam_track_id
352
+ self.track_ids_to_reject[rejected_track_id] = self.initial_frame_idx_by_track_id[sam_track_id]
353
+ print(f"Frame {out_frame_idx}. {yolo_track_id} & {sam_track_id} iou: {iou:.2f}. Reject: {rejected_track_id} (all) for long non-existence")
354
+ else:
355
+ # Otherwise, choose the obj with the latest yolo initial frame detection to reject
356
+ yolo_initial_frame = self.initial_frame_idx_by_track_id[yolo_track_id] # This is just the current frame
357
+ sam_initial_frame = self.initial_frame_idx_by_track_id[sam_track_id]
358
+ yolo_error = yolo_initial_frame >= sam_initial_frame
359
+ rejected_track_id = yolo_track_id if yolo_error else sam_track_id
360
+ reject_all_before_frame_idx = self.num_frames if yolo_error else sam_initial_frame
361
+ self.track_ids_to_reject[rejected_track_id] = reject_all_before_frame_idx
362
+ print(f"Frame {out_frame_idx}. {yolo_track_id} & {sam_track_id} iou: {iou:.2f}. Reject: {rejected_track_id} ({'before frame #' + str(reject_all_before_frame_idx) if not yolo_error else 'all'})")
363
+
364
+ self.video_segments[out_frame_idx] = {
365
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
366
+ for i, out_obj_id in enumerate(out_obj_ids)
367
+ }
368
+
369
+
370
+ def filter_sam_preds(self):
371
+ self.filtered_sam_preds = []
372
+ self.filtered_yolo_preds = []
373
+ self.num_sam_existence_frames_by_track_id = {}
374
+
375
+ def check_reject_pred(obj_id, frame_idx):
376
+ return obj_id in self.track_ids_to_reject and frame_idx < self.track_ids_to_reject[obj_id]
377
+
378
+ for frame_idx in range(self.num_frames):
379
+ self.filtered_sam_preds.append({})
380
+ self.filtered_yolo_preds.append({})
381
+
382
+ if frame_idx not in self.video_segments.keys():
383
+ continue
384
+
385
+ for obj_id, mask in self.video_segments[frame_idx].items():
386
+ # Only keep mask predictions that happen after the initial frame (with a small buffer)
387
+ # (SAM will try to predict them before the prompt frame, and will often get them wrong)
388
+ if (frame_idx >= self.initial_frame_idx_by_track_id[obj_id] - NUM_LOOK_BACK_FRAMES) and not check_reject_pred(obj_id, frame_idx):
389
+ self.filtered_sam_preds[frame_idx][obj_id] = mask
390
+
391
+ if mask.sum() > 0:
392
+ if self.num_sam_existence_frames_by_track_id.get(obj_id) is None:
393
+ self.num_sam_existence_frames_by_track_id[obj_id] = 0
394
+ self.num_sam_existence_frames_by_track_id[obj_id] += 1
395
+
396
+ for obj in self.filtered_objects:
397
+ if obj.initial_frame_idx == frame_idx and not check_reject_pred(obj.track_id, frame_idx):
398
+ self.filtered_yolo_preds[frame_idx][obj.track_id] = obj.bbox
399
+
400
+
401
+ def extract_final_bboxes(self):
402
+ # Extract the bboxes from the predicted masks
403
+ # Also filter out any overlaps. At this stage, if there are overlapping masks it is likely a fault on SAM's side
404
+ # (id switching/collecting) and there is not much we can do about it.
405
+ self.final_bboxes = []
406
+ rejected_ids = []
407
+ for frame_idx in range(self.num_frames):
408
+ self.final_bboxes.append({})
409
+ for obj_id, mask in self.filtered_sam_preds[frame_idx].items():
410
+ mask_box = self._get_bbox_from_mask(mask)
411
+ if mask_box is not None:
412
+ self.final_bboxes[frame_idx][obj_id] = mask_box
413
+
414
+ # Compute IOU overlap and eliminate duplicates
415
+ items_to_compare = list(self.final_bboxes[frame_idx].items()) + list(self.final_bboxes[frame_idx].items())
416
+ for (id0, bbox0), (id1, bbox1) in combinations(items_to_compare, 2):
417
+ if id0 == id1:
418
+ continue
419
+
420
+ if id0 not in self.final_bboxes[frame_idx] or id1 not in self.final_bboxes[frame_idx]: # Could've been removed previously
421
+ continue
422
+
423
+ if id0 in rejected_ids:
424
+ del self.final_bboxes[frame_idx][id0]
425
+ continue
426
+ if id1 in rejected_ids:
427
+ del self.final_bboxes[frame_idx][id1]
428
+ continue
429
+
430
+ iou = self._bbox_iou(bbox0, bbox1)
431
+ if iou > 0.8:
432
+ # Rejecting the prediction that exists for the least amount of frames throughout the video
433
+ frame_count0 = self.num_sam_existence_frames_by_track_id[id0]
434
+ frame_count1 = self.num_sam_existence_frames_by_track_id[id1]
435
+ rejected_id = id0 if frame_count0 < frame_count1 else id1
436
+
437
+ del self.final_bboxes[frame_idx][rejected_id]
438
+ rejected_ids.append(rejected_id)
439
+ # print(f"Frame {frame_idx}. {id0} & {id1} iou: {iou}. Rejecting {rejected_id}")
440
+
441
+ def _bbox_iou(self, box1, box2):
442
+ """
443
+ Compute the Intersection over Union (IoU) between two bounding boxes.
444
+
445
+ Parameters:
446
+ box1: (x1, y1, x2, y2) coordinates of the first box.
447
+ box2: (x1, y1, x2, y2) coordinates of the second box.
448
+
449
+ Returns:
450
+ iou: Intersection over Union value (0 to 1).
451
+ """
452
+ # Get the coordinates of the intersection rectangle
453
+ x1 = max(box1[0], box2[0])
454
+ y1 = max(box1[1], box2[1])
455
+ x2 = min(box1[2], box2[2])
456
+ y2 = min(box1[3], box2[3])
457
+
458
+ # Compute intersection area
459
+ inter_width = max(0, x2 - x1)
460
+ inter_height = max(0, y2 - y1)
461
+ inter_area = inter_width * inter_height
462
+
463
+ # Compute areas of both bounding boxes
464
+ box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
465
+ box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
466
+
467
+ # Compute union area
468
+ union_area = box1_area + box2_area - inter_area
469
+
470
+ # Compute IoU
471
+ iou = inter_area / union_area if union_area > 0 else 0
472
+
473
+ return iou
474
+
475
+ def _get_bbox_from_mask(self, mask):
476
+ mask_points = mask.nonzero()
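+ # mask has shape (1, H, W), so nonzero() returns (channel, row, col) index arrays: columns are x, rows are y.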
477
+ if len(mask_points[0]) == 0:
478
+ return None
479
+
480
+ x0 = min(mask_points[2])
481
+ y0 = min(mask_points[1])
482
+ x1 = max(mask_points[2])
483
+ y1 = max(mask_points[1])
484
+
485
+ return np.array([x0, y0, x1, y1])
486
+
487
+ from collections import defaultdict
488
+ class CVCOLORS:
489
+ RED = (0,0,255)
490
+ GREEN = (0,255,0)
491
+ BLUE = (255,0,0)
492
+ PURPLE = (247,44,200)
493
+ ORANGE = (44,162,247)
494
+ MINT = (239,255,66)
495
+ YELLOW = (2,255,250)
496
+ BROWN = (42,42,165)
497
+ LIME=(51,255,153)
498
+ GRAY=(128, 128, 128)
499
+ LIGHTPINK = (222,209,255)
500
+ LIGHTGREEN = (204,255,204)
501
+ LIGHTBLUE = (255,235,207)
502
+ LIGHTPURPLE = (255,153,204)
503
+ LIGHTRED = (204,204,255)
504
+ WHITE = (255,255,255)
505
+ BLACK = (0,0,0)
506
+
507
+ TRACKID_LOOKUP = defaultdict(lambda: (np.random.randint(50, 255), np.random.randint(50, 255), np.random.randint(50, 255)))
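+ # Each unseen track id gets a random BGR color on first lookup, which the defaultdict then keeps for that id.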
508
+ TYPE_LOOKUP = [BROWN, BLUE, PURPLE, RED, ORANGE, YELLOW, GREEN, LIGHTPURPLE, LIGHTPINK, LIGHTRED, GRAY]
509
+ REVERT_CHANNEL_F = lambda x: (x[2], x[1], x[0])
510
+
511
+
512
+ if __name__ == "__main__":
513
+
514
+ # Checkpoint paths
515
+ yolo_ckpt = "yolov8x.pt" # Will auto download with utltralytics
516
+
517
+ # NOTE: Need to download beforehand from https://github.com/facebookresearch/sam2
518
+ sam2_ckpt = "/network/scratch/x/xuolga/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
519
+ sam2_cfg = "./configs/sam2.1/sam2.1_hiera_b+.yaml"
520
+
521
+ yolo_sam = YoloSamProcessor(yolo_ckpt, sam2_ckpt, sam2_cfg)
522
+ output_dir = f"/path/to/output_dir"
523
+ bboxes_out_dir = os.path.join(output_dir, "bboxes")
524
+ labels_out_dir = os.path.join(output_dir, "json")
525
+ os.makedirs(bboxes_out_dir, exist_ok=True)
526
+ os.makedirs(labels_out_dir, exist_ok=True)
527
+
528
+ video_dir_root = f"/path/to/dada2000_images_12fps/images"
529
+ videos = ['8_90038', '8_90002', '10_90019', '10_90045', '10_90027', '10_90029', '10_90082', '10_90021', '10_90064', '10_90083', '10_90141', '10_90139', '10_90034', '10_90134', '10_90056', '10_90169', '10_90040', '11_90109', '11_90162', '11_90202', '11_90142', '11_90180', '11_90161', '11_90091', '11_90189', '11_90002', '11_90192', '11_90221', '11_90181', '12_90007', '12_90042', '13_90002', '13_90008', '13_90007', '14_90012', '14_90018', '14_90014', '14_90027', '24_90017', '24_90005', '24_90006', '24_90011', '42_90021', '43_90013', '48_90078', '48_90031', '48_90001', '48_90075', '49_90030', '49_90021', '61_90016', '61_90004']
530
+
531
+ for video_name in tqdm(videos):
532
+ cat = video_name.split("_")[0]
533
+ video_dir = os.path.join(video_dir_root, cat, video_name)
534
+
535
+ if len(os.listdir(video_dir)) > 300:
536
+ print("Skipping:", video_name)
537
+ continue
538
+ out_data = yolo_sam(video_dir, rel_bbox=False)
539
+
540
+ # Output format:
541
+ """
542
+ out_data = [ # List of Dicts, each inner dict represents one frame of the video
543
+ {
544
+ "image_source": img1.jpg,
545
+ "labels":
546
+ [ # List of Dicts, each dict represents one tracked object
547
+ {'track_id': 0, 'name': 'car', 'class': 2, 'box': [x0, y0, x1, y1]}, # Obj 0
548
+ {...}, # Obj 1
549
+ ]
550
+ }, # Frame 0
551
+
552
+ {...}, # Frame 1
553
+ ]
554
+ """
555
+
556
+ # Plot final bboxes and save to file
557
+ out_json_path = os.path.join(labels_out_dir, f"{video_name}.json")
558
+ os.makedirs(os.path.dirname(out_json_path), exist_ok=True)
559
+ with open(out_json_path, 'w') as json_file:
560
+ json.dump(out_data, json_file, indent=1)
561
+
562
+ og_frames = sorted(os.listdir(video_dir))
563
+ out_bbox_path = os.path.join(bboxes_out_dir, video_name)
564
+ os.makedirs(out_bbox_path, exist_ok=True)
565
+ for frame_idx, frame_data in enumerate(out_data):
566
+ plt.figure(figsize=(9, 6))
567
+ plt.axis("off")
568
+ img = Image.open(os.path.join(video_dir, og_frames[frame_idx]))
569
+ plt.imshow(img)
570
+ for obj in frame_data["labels"]:
571
+ color = np.array(CVCOLORS.REVERT_CHANNEL_F(CVCOLORS.TYPE_LOOKUP[obj["class"]])) / 255.0
572
+ show_box(obj["box"], plt.gca(), label=str(obj["track_id"]), color=color)
573
+
574
+ frame_id_name = og_frames[frame_idx].split("_")[-1].split(".")[0]
575
+ plt.savefig(os.path.join(out_bbox_path, f"bboxes_frame_{frame_id_name}"))
+ plt.close()
576
+
577
+ # Save videos of bboxes
578
+ videos_out_dir = os.path.join(output_dir, "videos")
579
+ out_video_path = os.path.join(videos_out_dir, f"{video_name}_with_bboxes.mp4")
580
+ os.makedirs(os.path.dirname(out_video_path), exist_ok=True)
581
+ create_video_from_images(out_bbox_path, out_video_path, out_fps=12)
582
+
583
+
584
+
src/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .parser import parse_args
2
+ from .utils import encode_video_image, get_add_time_ids, get_samples, get_model_attr
src/utils/parser.py ADDED
@@ -0,0 +1,472 @@
1
+ # TODO: Clean up unused args
2
+ def parse_args():
3
+ import argparse
4
+ parser = argparse.ArgumentParser(description="Simple example of a training script.")
5
+
6
+ parser.add_argument(
7
+ "--project_name",
8
+ type=str,
9
+ default="car_crash",
10
+ help="Name of the project."
11
+ )
12
+ parser.add_argument(
13
+ "--pretrained_model_name_or_path",
14
+ type=str,
15
+ default=None,
16
+ required=True,
17
+ help="Path to pretrained model or model identifier from huggingface.co/models.",
18
+ )
19
+ parser.add_argument(
20
+ "--finetuned_svd_path",
21
+ type=str,
22
+ default=None,
23
+ required=False,
24
+ help="Path to pretrained unet model. Used to override 'pretrained_model_name_or_path', and will default to this if not provided",
25
+ )
26
+ parser.add_argument(
27
+ "--revision",
28
+ type=str,
29
+ default=None,
30
+ required=False,
31
+ help="Revision of pretrained model identifier from huggingface.co/models.",
32
+ )
33
+ parser.add_argument(
34
+ "--variant",
35
+ type=str,
36
+ default=None,
37
+ help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
38
+ )
39
+ parser.add_argument(
40
+ "--output_dir",
41
+ type=str,
42
+ default="out",
43
+ help="The output directory where the model predictions and checkpoints will be written.",
44
+ )
45
+ parser.add_argument(
46
+ "--logging_dir",
47
+ type=str,
48
+ default="logs",
49
+ help=(
50
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
51
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
52
+ ),
53
+ )
54
+ parser.add_argument(
55
+ "--eval_dir",
56
+ type=str,
57
+ default="eval",
58
+ help=(
59
+ "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
60
+ " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
61
+ ),
62
+ )
63
+ parser.add_argument(
64
+ "--learning_rate",
65
+ type=float,
66
+ default=1e-4,
67
+ help="Initial learning rate (after the potential warmup period) to use.",
68
+ )
69
+ parser.add_argument(
70
+ "--object_net_lr_factor",
71
+ type=float,
72
+ default=1.0,
73
+ help="Factor to scale the learning rate of the object network.",
74
+ )
75
+ parser.add_argument(
76
+ "--gradient_accumulation_steps",
77
+ type=int,
78
+ default=1,
79
+ help="Number of updates steps to accumulate before performing a backward/update pass.",
80
+ )
81
+ parser.add_argument(
82
+ "--scale_lr",
83
+ action="store_true",
84
+ default=False,
85
+ help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
86
+ )
87
+ parser.add_argument(
88
+ "--lr_scheduler",
89
+ type=str,
90
+ default="constant",
91
+ help=(
92
+ 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
93
+ ' "constant", "constant_with_warmup"]'
94
+ ),
95
+ )
96
+ parser.add_argument(
97
+ "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
98
+ )
99
+ parser.add_argument(
100
+ "--snr_gamma",
101
+ type=float,
102
+ default=None,
103
+ help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
104
+ "More details here: https://arxiv.org/abs/2303.09556.",
105
+ )
106
+ parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
107
+ parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
108
+ parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
109
+ parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
110
+ parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
111
+ parser.add_argument(
112
+ "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
113
+ )
114
+ parser.add_argument("--num_train_epochs", type=int, default=1)
115
+ parser.add_argument(
116
+ "--max_train_steps",
117
+ type=int,
118
+ default=None,
119
+ help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
120
+ )
121
+ parser.add_argument(
122
+ "--dataloader_num_workers",
123
+ type=int,
124
+ default=0,
125
+ help=(
126
+ "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
127
+ ),
128
+ )
129
+ parser.add_argument(
130
+ "--mixed_precision",
131
+ type=str,
132
+ default=None,
133
+ choices=["no", "fp16", "bf16"],
134
+ help=(
135
+ "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
136
+ " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
137
+ " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
138
+ ),
139
+ )
140
+ parser.add_argument(
141
+ "--rank",
142
+ type=int,
143
+ default=4,
144
+ help=("The dimension of the LoRA update matrices."),
145
+ )
146
+ parser.add_argument(
147
+ "--checkpointing_steps",
148
+ type=int,
149
+ default=500,
150
+ help=(
151
+ "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming"
152
+ " training using `--resume_from_checkpoint`."
153
+ ),
154
+ )
155
+ parser.add_argument(
156
+ "--checkpointing_time",
157
+ type=int,
158
+ default=-1,
159
+ help=(
160
+ "Save a checkpoint of the training state every X seconds. Useful to save checkpoint when using jobs with short timeouts. If <= 0, will not save any checkpoints based on time."
161
+ " training using `--resume_from_checkpoint`."
162
+ ),
163
+ )
164
+ parser.add_argument(
165
+ "--checkpoints_total_limit",
166
+ type=int,
167
+ default=None,
168
+ help=("Max number of checkpoints to store."),
169
+ )
170
+ parser.add_argument(
171
+ "--resume_from_checkpoint",
172
+ type=str,
173
+ default=None,
174
+ help=(
175
+ "Whether training should be resumed from a previous checkpoint. Use a path saved by"
176
+ ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
177
+ ),
178
+ )
179
+ parser.add_argument(
180
+ "--enable_gradient_checkpointing",
181
+ action="store_true",
182
+ help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
183
+ )
184
+ parser.add_argument(
185
+ "--data_root",
186
+ type=str,
187
+ default="./data",
188
+ help="The root directory of the dataset.",
189
+ )
190
+ parser.add_argument(
191
+ "--validation_steps",
192
+ type=int,
193
+ default=50,
194
+ help=(
195
+ "Run dreambooth validation every X epochs. Dreambooth validation consists of running the prompt"
196
+ " `args.validation_prompt` multiple times: `args.num_validation_images`."
197
+ ),
198
+ )
199
+ parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
200
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
201
+ parser.add_argument(
202
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
203
+ )
204
+ parser.add_argument("--disable_object_condition", action="store_true", help="Whether or not to disable object condition.")
205
+ parser.add_argument("--encoder_hid_dim_type", type=str, default=None, help="The type of unet's input hidden.")
206
+ parser.add_argument(
207
+ "--report_to",
208
+ type=str,
209
+ default="wandb",
210
+ help=(
211
+ 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
212
+ ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
213
+ ),
214
+ )
215
+ parser.add_argument(
216
+ "--run_name",
217
+ type=str,
218
+ default=None,
219
+ help="The name of the run log.",
220
+ )
221
+ parser.add_argument(
222
+ "--prediction_type",
223
+ type=str,
224
+ default=None,
225
+ help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
226
+ )
227
+ parser.add_argument(
228
+ "--guidance_scale",
229
+ type=float,
230
+ default=1.0,
231
+ help="(Image only). A higher guidance scale value encourages the model to generate images closely linked to the text `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`."
232
+ )
233
+ parser.add_argument(
234
+ "--guidance_rescale",
235
+ type=float,
236
+ default=0.0,
237
+ help="(Image only). Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when using zero terminal SNR."
238
+ )
239
+ parser.add_argument(
240
+ "--min_guidance_scale",
241
+ type=float,
242
+ default=1.0,
243
+ help="(Video generation only). The minimum guidance scale. Used for the classifier free guidance with first frame."
244
+ )
245
+ parser.add_argument(
246
+ "--max_guidance_scale",
247
+ type=float,
248
+ default=3.0,
249
+ help="(Video generation only). The maximum guidance scale. Used for the classifier free guidance with last frame."
250
+ )
251
+ parser.add_argument(
252
+ "--conditioning_dropout_prob",
253
+ type=float,
254
+ default=0.1,
255
+ help="Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used in training InstructPix2Pix. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800.",
256
+ )
+ parser.add_argument(
+     "--clip_length",
+     type=int,
+     default=25,
+     help="The number of frames in a clip.",
+ )
+ parser.add_argument("--use_ema", action="store_true", help="Whether to use an EMA model.")
+ parser.add_argument(
+     "--non_ema_revision",
+     type=str,
+     default=None,
+     required=False,
+     help=(
+         "Revision of the pretrained non-EMA model identifier. Must be a branch, tag or git identifier of the local or"
+         " remote repository specified with --pretrained_model_name_or_path."
+     ),
+ )
+ parser.add_argument(
+     "--backprop_temporal_blocks_start_iter",
+     type=int,
+     default=-1,
+     help="(Video generation only). The iteration at which to start backpropagating only into the temporal blocks (-1 means always backpropagate through the entire network).",
+ )
+ parser.add_argument(
+     "--enable_lora",
+     action="store_true",
+     default=False,
+     help="Enable LoRA.",
+ )
+ parser.add_argument(
+     "--add_bbox_frame_conditioning",
+     action="store_true",
+     default=False,
+     help="(Video generation only). Add bbox frame conditioning.",
+ )
+ parser.add_argument(
+     "--bbox_dropout_prob",
+     type=float,
+     default=0.0,
+     help="(Video generation only). Bbox dropout probability. Drops out the bbox conditionings.",
+ )
+ parser.add_argument(
+     "--num_demo_samples",
+     type=int,
+     default=1,
+     help="Number of samples to generate during the demo.",
+ )
+ parser.add_argument(
+     "--noise_aug_strength",
+     type=float,
+     default=0.02,
+     help="(Video generation only). Strength of the noise augmentation.",
+ )
+ parser.add_argument(
+     "--num_inference_steps",
+     type=int,
+     default=25,
+     help="Number of inference denoising steps.",
+ )
+ parser.add_argument(
+     "--conditioning_scale",
+     type=float,
+     default=1.0,
+     help="(Controlnet only). The scale of the conditioning."
+ )
+ parser.add_argument(
+     "--train_H",
+     type=int,
+     default=None,
+     help="For training, the height of the image to use. If None, the default height is 320 for video diffusion and 512 for image diffusion."
+ )
+ parser.add_argument(
+     "--train_W",
+     type=int,
+     default=None,
+     help="For training, the width of the image to use. If None, the default width is 512."
+ )
+ parser.add_argument(
+     "--eval_H",
+     type=int,
+     default=None,
+     help="For evaluation, the height of the image to use. If None, the default height is 320 for video diffusion and 512 for image diffusion."
+ )
+ parser.add_argument(
+     "--generate_bbox",
+     action="store_true",
+     default=False,
+     help="(Controlnet only). Whether to generate bboxes."
+ )
+ parser.add_argument(
+     "--predict_bbox",
+     action="store_true",
+     default=False,
+     help="(Video diffusion only). Whether to predict bboxes."
+ )
+ parser.add_argument(
+     "--evaluate_only",
+     action="store_true",
+     default=False,
+     help="Whether to only evaluate the model."
+ )
+ parser.add_argument(
+     "--demo_path",
+     default=None,
+     type=str,
+     help="Path where the demo samples are saved."
+ )
+ parser.add_argument(
+     "--pretrained_bbox_model",
+     default=None,
+     type=str,
+     help="Path to the pretrained bbox model."
+ )
+ parser.add_argument(
+     "--if_last_frame_trajectory",
+     action="store_true",
+     default=False,
+     help="Whether to use the last frame as the trajectory."
+ )
+ parser.add_argument(
+     "--fps",
+     type=int,
+     default=None,
+     help="FPS of the video. If None, a dataset-dependent default is set after parsing."
+ )
+ parser.add_argument(
+     "--num_cond_bbox_frames",
+     type=int,
+     default=3,
+     help="Number of conditioning bbox frames."
+ )
+ parser.add_argument(
+     "--wandb_entity",
+     type=str,
+     default="",
+     help="The wandb entity to log to.",
+ )
+ parser.add_argument(
+     "--non_overlapping_clips",
+     action="store_true",
+     default=False,
+     help="Load clips that do not overlap in the dataset.",
+ )
+ parser.add_argument(
+     "--disable_wandb",
+     action="store_true",
+     default=False,
+     help="Disable wandb logging.",
+ )
+ parser.add_argument(
+     "--empty_cuda_cache",
+     action="store_true",
+     default=False,
+     help="Periodically call `torch.cuda.empty_cache()` during the training loop. Helps reduce the chance of memory leaks and OOM errors.",
+ )
+ parser.add_argument(
+     "--bbox_masking_prob",
+     type=float,
+     default=0.0,
+     help="Per-agent bbox dropout probability. Randomly drops the bbox conditioning of individual agents in the scene.",
+ )
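+ # Illustrative sketch (assumed bbox shape (num_frames, num_agents, 4); not necessarily this
+ # repo's dataloader code): per-agent masking keeps each agent for the whole clip with
+ # probability 1 - bbox_masking_prob, e.g.
+ #     keep_agent = torch.rand(num_agents) >= args.bbox_masking_prob
+ #     bboxes = bboxes * keep_agent[None, :, None]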
+
+ parser.add_argument(
+     "--dataset_name",
+     nargs="+",
+     type=str,
+     default="russia_crash",
+     choices=["russia_crash", "nuscenes", "dada2000", "mmau", "bdd100k"],
+     help=(
+         "The name of the dataset to train on. A list of dataset names can be specified to merge them together."
+     ),
+ )
+ parser.add_argument(
+     "--use_action_conditioning",
+     action="store_true",
+     default=False,
+     help="Whether to use action conditioning (i.e. accident-type flags)."
+ )
+ parser.add_argument(
+     "--contiguous_bbox_masking_prob",
+     type=float,
+     default=0.0,
+     help="Probability of masking out bbox conditionings in a contiguous manner (i.e. all bboxes after a certain frame). A random frame in [0, N] is selected and all subsequent frames are masked.",
+ )
+ parser.add_argument(
+     "--contiguous_bbox_masking_start_ratio",
+     type=float,
+     default=0.0,
+     help="Fraction of the contiguous bbox maskings (as defined by --contiguous_bbox_masking_prob) that are applied from the start of the video instead of the end.",
+ )
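+ # Illustrative sketch (not necessarily this repo's exact logic): with probability
+ # contiguous_bbox_masking_prob a cutoff frame t is drawn uniformly, and bboxes are
+ # zeroed from t to the end; a contiguous_bbox_masking_start_ratio fraction of those
+ # maskings instead zero the frames before t, e.g.
+ #     if random.random() < args.contiguous_bbox_masking_prob:
+ #         t = random.randint(0, num_frames - 1)
+ #         if random.random() < args.contiguous_bbox_masking_start_ratio:
+ #             bboxes[:t] = 0
+ #         else:
+ #             bboxes[t:] = 0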
+ parser.add_argument(
+     "--val_on_first_step",
+     action="store_true",
+     default=False,
+     help="Whether to run validation on the first step (useful for testing)."
+ )
+
+ args = parser.parse_args()
+ # Default to using the same revision for the non-EMA model if not specified.
+ if args.non_ema_revision is None:
+     args.non_ema_revision = args.revision
+
+ if args.enable_lora:
+     args.backprop_temporal_blocks_start_iter = -1
+
+ if args.evaluate_only:
+     assert args.resume_from_checkpoint is not None, "Must provide a checkpoint to evaluate the model."
+
+ if args.fps is None:
+     # --dataset_name uses nargs="+", so it may be a list (or the string default); normalise before checking.
+     dataset_names = args.dataset_name if isinstance(args.dataset_name, (list, tuple)) else [args.dataset_name]
+     if "bdd100k" in dataset_names:
+         args.fps = 5
+     elif "nuscenes" in dataset_names:
+         args.fps = 7  # NOTE: Must match the fps in the nuscenes dataloader (=2 if no interpolation)
+     else:
+         args.fps = 7
+ return args
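+
+ # Hypothetical usage (script name and launcher are placeholders, not taken from this repo):
+ #     args = parse_args()
+ #     # e.g. launched as:
+ #     #   accelerate launch train.py --dataset_name mmau russia_crash \
+ #     #       --data_root ./data --clip_length 25 --use_action_conditioning \
+ #     #       --report_to wandb --run_name my_run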