Alex Ergasti commited on
Commit
b89c182
·
1 Parent(s): 9ca6da9
Files changed (7) hide show
  1. app.py +140 -0
  2. common_parser.py +50 -0
  3. converter.py +368 -0
  4. dataset.py +206 -0
  5. download.py +50 -0
  6. models.py +751 -0
  7. utils.py +267 -0
app.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ # the first flag below was False when we tested this script but True makes A100 training a lot faster:
3
+ torch.backends.cuda.matmul.allow_tf32 = True
4
+ torch.backends.cudnn.allow_tf32 = True
5
+
6
+ import os
7
+
8
+ from diffusers.models import AutoencoderKL
9
+ from models import FLAV_models
10
+
11
+ from diffusion.rectified_flow import RectifiedFlow
12
+
13
+ from diffusers.training_utils import EMAModel
14
+ from converter import Generator
15
+ from utils import *
16
+
17
+ import tempfile
18
+ import gradio as gr
19
+ from huggingface_hub import hf_hub_download
20
+ AUDIO_T_PER_FRAME = 1600 // 160
21
+
22
+ #################################################################################
23
+ # Global Model Setup #
24
+ #################################################################################
25
+
26
+ # These variables will be initialized in setup_models() and used in main()
27
+ vae = None
28
+ model = None
29
+ vocoder = None
30
+ audio_scale = 3.50
31
+
32
+
33
+ def setup_models():
34
+ global vae, model, vocoder
35
+
36
+ device = "cpu"
37
+ vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-ema")
38
+
39
+ model = FLAV_models["FLAV-B/1"](
40
+ latent_size= 256//8,
41
+ in_channels = 4,
42
+ num_classes = 0,
43
+ predict_frames = 10,
44
+ causal_attn = True,
45
+ )
46
+
47
+ ckpt_path = hf_hub_download(repo_id="MaverickAlex/R-FLAV", filename="aist-ema.pth")
48
+
49
+ state_dict = torch.load(ckpt_path)
50
+
51
+ ema = EMAModel(model.parameters())
52
+ ema.load_state_dict(state_dict)
53
+ ema.copy_to(model.parameters())
54
+
55
+ hf_hub_download(repo_id="MaverickAlex/R-FLAV", filename="vocoder-aist/config.json")
56
+ vocoder_path = hf_hub_download(repo_id="MaverickAlex/R-FLAV", filename="vocoder-aist/vocoder.pt")
57
+
58
+ vocoder_path = vocoder_path.replace("vocoder.pt", "")
59
+ vocoder = Generator.from_pretrained(vocoder_path)
60
+
61
+ vae.to(device)
62
+ model.to(device)
63
+ vocoder.to(device)
64
+
65
+
66
+
67
+ def generate_video(num_frames=10, steps=2, seed=42):
68
+ global vae, model, vocoder
69
+ # Setup device
70
+ device = "cuda" if torch.cuda.is_available() else "cpu"
71
+ torch.manual_seed(seed)
72
+
73
+ # Set up generation parameters
74
+ video_latent_size = (1, 10, 4, 256//8, 256//8)
75
+ audio_latent_size = (1, 10, 1, 256, AUDIO_T_PER_FRAME)
76
+
77
+ rectified_flow = RectifiedFlow(num_timesteps=steps,
78
+ warmup_timesteps=10,
79
+ window_size=10)
80
+
81
+ # Generate sample
82
+ video, audio = generate_sample(
83
+ vae=vae, # These globals are set by setup_models
84
+ rectified_flow=rectified_flow,
85
+ forward_fn=model.forward,
86
+ video_length=num_frames,
87
+ video_latent_size=video_latent_size,
88
+ audio_latent_size=audio_latent_size,
89
+ y=None,
90
+ cfg_scale=None,
91
+ device=device
92
+ )
93
+
94
+ # Convert to wav
95
+ wavs = get_wavs(audio, vocoder, audio_scale, device)
96
+
97
+ # Save to temporary files
98
+ temp_dir = tempfile.mkdtemp()
99
+ video_path = os.path.join(temp_dir, "video", "generated_video.mp4")
100
+
101
+ # Use the first video and wav
102
+ vid, wav = video[0], wavs[0]
103
+ save_multimodal(vid, wav, temp_dir, "generated")
104
+
105
+ return video_path
106
+
107
+ def ui_generate_video(num_frames, steps, seed):
108
+ try:
109
+ return generate_video(int(num_frames), int(steps), int(seed))
110
+ except Exception as e:
111
+ return None
112
+
113
+ # Create Gradio interface
114
+ with gr.Blocks(title="FLAV Video Generator") as demo:
115
+ gr.Markdown("# FLAV Video Generator")
116
+ gr.Markdown("Generate videos using the FLAV model")
117
+
118
+ num_frames = None
119
+ steps = None
120
+ seed = None
121
+
122
+ video_output = None
123
+ with gr.Row():
124
+ with gr.Column():
125
+ num_frames = gr.Slider(minimum=5, maximum=30, step=1, value=10, label="Number of Frames")
126
+ steps = gr.Slider(minimum=1, maximum=20, step=1, value=5, label="Number of Steps (multiplied by a factor of 10)")
127
+ seed = gr.Slider(minimum=0, maximum=9999, step=1, value=42, label="Random Seed")
128
+ generate_btn = gr.Button("Generate Video")
129
+
130
+ with gr.Column():
131
+ video_output = gr.PlayableVideo(label="Generated Video", width=256, height=256)
132
+ generate_btn.click(
133
+ fn=ui_generate_video,
134
+ inputs=[num_frames, steps, seed],
135
+ outputs=[video_output]
136
+ )
137
+
138
+ if __name__ == "__main__":
139
+ setup_models()
140
+ demo.launch()
common_parser.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from models import FLAV_models
3
+
4
+ class CommonParser:
5
+ def __init__(self):
6
+ self.parser = argparse.ArgumentParser()
7
+ # Datasets
8
+ self.parser.add_argument("--data-path", type=str, required=True)
9
+ self.parser.add_argument("--load-latents", action="store_true")
10
+ self.parser.add_argument("--num-classes", type=int, default=9)
11
+ self.parser.add_argument("--image-size", type=int, choices=[64, 256, 512, 1024], default=256)
12
+ self.parser.add_argument("--target-video-fps", type=int, default=10)
13
+ self.parser.add_argument("--ignore-cache", action="store_true")
14
+ self.parser.add_argument("--audio-scale", type=float, default=3.5009668382765917)
15
+
16
+ # Results
17
+ self.parser.add_argument("--video-length", type=int, default=1)
18
+ self.parser.add_argument("--predict-frames", type=int, default=10)
19
+ self.parser.add_argument("--results-dir", type=str, default="results")
20
+ self.parser.add_argument("--experiment-dir", type=str, default="")
21
+ self.parser.add_argument("--checkpoint-dir", type=str, default="checkpoints")
22
+ self.parser.add_argument("--ckpt-every", type=int, default=5_000)
23
+
24
+ # Models
25
+ self.parser.add_argument("--seed", type=int, default=42)
26
+ self.parser.add_argument("--model", type=str, choices=list(FLAV_models.keys()), default="FLAV-XL/2")
27
+ self.parser.add_argument("--vae", type=str, choices=["ema", "mse"], default="ema")
28
+ self.parser.add_argument("--use_sd_vae", action="store_true")
29
+ self.parser.add_argument("--vocoder-ckpt", type=str, default="vocoder/")
30
+ self.parser.add_argument("--optimizer-wd", type=float, default=0.02)
31
+
32
+ # Resources
33
+ self.parser.add_argument("--batch-size", type=int, default=4)
34
+ self.parser.add_argument("--num-workers", type=int, default=32)
35
+ self.parser.add_argument("--log-every", type=int, default=100)
36
+
37
+ # Config
38
+ self.parser.add_argument("--load-config", action="store_true")
39
+ self.parser.add_argument("--config-no-save", action="store_true")
40
+ self.parser.add_argument("--config-path", type=str, default="")
41
+ self.parser.add_argument("--config-name", type=str, default="config.json")
42
+
43
+ # Architecture
44
+ self.parser.add_argument("--causal-attn", action="store_true")
45
+
46
+ #RF
47
+ self.parser.add_argument("--num_timesteps", type=int, default=2)
48
+
49
+ def get_parser(self):
50
+ return self.parser
converter.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import numpy as np
4
+ from PIL import Image
5
+
6
+ import math
7
+ import os
8
+ import random
9
+ import torch
10
+ import json
11
+ import torch.utils.data
12
+ import numpy as np
13
+ import librosa
14
+ from librosa.util import normalize
15
+ from scipy.io.wavfile import read
16
+ from librosa.filters import mel as librosa_mel_fn
17
+
18
+ import torch.nn.functional as F
19
+ import torch.nn as nn
20
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
21
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
22
+
23
+
24
+ def normalize(images):
25
+ """
26
+ Normalize an image array to [-1,1].
27
+ """
28
+ if images.min() >= 0:
29
+ return 2.0 * images - 1.0
30
+ else:
31
+ return images
32
+
33
+ def denormalize(images):
34
+ """
35
+ Denormalize an image array to [0,1].
36
+ """
37
+ if images.min() < 0:
38
+ return (images / 2 + 0.5).clamp(0, 1)
39
+ else:
40
+ return images.clamp(0, 1)
41
+
42
+
43
+ MAX_WAV_VALUE = 32768.0
44
+
45
+
46
+ def load_wav(full_path):
47
+ sampling_rate, data = read(full_path)
48
+ return data, sampling_rate
49
+
50
+
51
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
52
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
53
+
54
+
55
+ def dynamic_range_decompression(x, C=1):
56
+ return np.exp(x) / C
57
+
58
+
59
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
60
+ return torch.log(torch.clamp(x, min=clip_val) * C)
61
+
62
+
63
+ def dynamic_range_decompression_torch(x, C=1):
64
+ return torch.exp(x) / C
65
+
66
+
67
+ def spectral_normalize_torch(magnitudes):
68
+ output = dynamic_range_compression_torch(magnitudes)
69
+ return output
70
+
71
+
72
+ def spectral_de_normalize_torch(magnitudes):
73
+ output = dynamic_range_decompression_torch(magnitudes)
74
+ return output
75
+
76
+
77
+ mel_basis = {}
78
+ hann_window = {}
79
+
80
+
81
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
82
+ # if torch.min(y) < -1.:
83
+ # print('min value is ', torch.min(y))
84
+ # if torch.max(y) > 1.:
85
+ # print('max value is ', torch.max(y))
86
+
87
+ global mel_basis, hann_window
88
+ if fmax not in mel_basis:
89
+ mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
90
+ mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
91
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
92
+
93
+ y = torch.nn.functional.pad(y, (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
94
+ y = y.squeeze(1)
95
+
96
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
97
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
98
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
99
+ spec = torch.view_as_real(spec)
100
+ spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
101
+
102
+ spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
103
+ spec = spectral_normalize_torch(spec)
104
+
105
+ return spec
106
+
107
+
108
+ def spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
109
+ global hann_window
110
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
111
+
112
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
113
+ y = y.squeeze(1)
114
+
115
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
116
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
117
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
118
+ spec = torch.view_as_real(spec)
119
+ spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
120
+
121
+ return spec
122
+
123
+
124
+ def normalize_spectrogram(
125
+ spectrogram: torch.Tensor,
126
+ max_value: float = 200,
127
+ min_value: float = 1e-5,
128
+ power: float = 1.,
129
+ inverse: bool = False
130
+ ) -> torch.Tensor:
131
+
132
+ # Rescale to 0-1
133
+ max_value = np.log(max_value) # 5.298317366548036
134
+ min_value = np.log(min_value) # -11.512925464970229
135
+
136
+ assert spectrogram.max() <= max_value and spectrogram.min() >= min_value
137
+
138
+ data = (spectrogram - min_value) / (max_value - min_value)
139
+
140
+ # Invert
141
+ if inverse:
142
+ data = 1 - data
143
+
144
+ # Apply the power curve
145
+ data = torch.pow(data, power)
146
+
147
+ return data
148
+
149
+
150
+
151
+ def denormalize_spectrogram(
152
+ data: torch.Tensor,
153
+ max_value: float = 200,
154
+ min_value: float = 1e-5,
155
+ power: float = 1,
156
+ ) -> torch.Tensor:
157
+
158
+ max_value = np.log(max_value)
159
+ min_value = np.log(min_value)
160
+
161
+ # Reverse the power curve
162
+ data = torch.pow(data, 1 / power)
163
+
164
+ # Rescale to max value
165
+ spectrogram = data * (max_value - min_value) + min_value
166
+
167
+ return spectrogram
168
+
169
+
170
+ def get_mel_spectrogram_from_audio(audio, device="cuda"):
171
+ audio = audio / MAX_WAV_VALUE
172
+ audio = librosa.util.normalize(audio) * 0.95
173
+
174
+ audio = torch.FloatTensor(audio)
175
+ audio = audio.unsqueeze(0)
176
+
177
+ waveform = audio
178
+ spec = mel_spectrogram(waveform, n_fft=2048, num_mels=256, sampling_rate=16000, hop_size=160, win_size=1024, fmin=0, fmax=8000, center=False)
179
+ return audio, spec
180
+
181
+
182
+
183
+ LRELU_SLOPE = 0.1
184
+ MAX_WAV_VALUE = 32768.0
185
+
186
+
187
+ class AttrDict(dict):
188
+ def __init__(self, *args, **kwargs):
189
+ super(AttrDict, self).__init__(*args, **kwargs)
190
+ self.__dict__ = self
191
+
192
+
193
+ def get_config(config_path):
194
+ config = json.loads(open(config_path).read())
195
+ config = AttrDict(config)
196
+ return config
197
+
198
+ def init_weights(m, mean=0.0, std=0.01):
199
+ classname = m.__class__.__name__
200
+ if classname.find("Conv") != -1:
201
+ m.weight.data.normal_(mean, std)
202
+
203
+
204
+ def apply_weight_norm(m):
205
+ classname = m.__class__.__name__
206
+ if classname.find("Conv") != -1:
207
+ weight_norm(m)
208
+
209
+
210
+ def get_padding(kernel_size, dilation=1):
211
+ return int((kernel_size*dilation - dilation)/2)
212
+
213
+
214
+ class ResBlock1(torch.nn.Module):
215
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
216
+ super(ResBlock1, self).__init__()
217
+ self.h = h
218
+ self.convs1 = nn.ModuleList([
219
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
220
+ padding=get_padding(kernel_size, dilation[0]))),
221
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
222
+ padding=get_padding(kernel_size, dilation[1]))),
223
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
224
+ padding=get_padding(kernel_size, dilation[2])))
225
+ ])
226
+ self.convs1.apply(init_weights)
227
+
228
+ self.convs2 = nn.ModuleList([
229
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
230
+ padding=get_padding(kernel_size, 1))),
231
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
232
+ padding=get_padding(kernel_size, 1))),
233
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
234
+ padding=get_padding(kernel_size, 1)))
235
+ ])
236
+ self.convs2.apply(init_weights)
237
+
238
+ def forward(self, x):
239
+ for c1, c2 in zip(self.convs1, self.convs2):
240
+ xt = F.leaky_relu(x, LRELU_SLOPE)
241
+ xt = c1(xt)
242
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
243
+ xt = c2(xt)
244
+ x = xt + x
245
+ return x
246
+
247
+ def remove_weight_norm(self):
248
+ for l in self.convs1:
249
+ remove_weight_norm(l)
250
+ for l in self.convs2:
251
+ remove_weight_norm(l)
252
+
253
+
254
+ class ResBlock2(torch.nn.Module):
255
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
256
+ super(ResBlock2, self).__init__()
257
+ self.h = h
258
+ self.convs = nn.ModuleList([
259
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
260
+ padding=get_padding(kernel_size, dilation[0]))),
261
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
262
+ padding=get_padding(kernel_size, dilation[1])))
263
+ ])
264
+ self.convs.apply(init_weights)
265
+
266
+ def forward(self, x):
267
+ for c in self.convs:
268
+ xt = F.leaky_relu(x, LRELU_SLOPE)
269
+ xt = c(xt)
270
+ x = xt + x
271
+ return x
272
+
273
+ def remove_weight_norm(self):
274
+ for l in self.convs:
275
+ remove_weight_norm(l)
276
+
277
+
278
+
279
+ class Generator(torch.nn.Module):
280
+ def __init__(self, h):
281
+ super(Generator, self).__init__()
282
+ self.h = h
283
+ self.num_kernels = len(h.resblock_kernel_sizes)
284
+ self.num_upsamples = len(h.upsample_rates)
285
+ self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)) # change: 80 --> 512
286
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
287
+
288
+ self.ups = nn.ModuleList()
289
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
290
+ if (k-u) % 2 == 0:
291
+ self.ups.append(weight_norm(
292
+ ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
293
+ k, u, padding=(k-u)//2)))
294
+ else:
295
+ self.ups.append(weight_norm(
296
+ ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
297
+ k, u, padding=(k-u)//2+1, output_padding=1)))
298
+
299
+ # self.ups.append(weight_norm(
300
+ # ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
301
+ # k, u, padding=(k-u)//2)))
302
+
303
+
304
+ self.resblocks = nn.ModuleList()
305
+ for i in range(len(self.ups)):
306
+ ch = h.upsample_initial_channel//(2**(i+1))
307
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
308
+ self.resblocks.append(resblock(h, ch, k, d))
309
+
310
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
311
+ self.ups.apply(init_weights)
312
+ self.conv_post.apply(init_weights)
313
+
314
+ def forward(self, x):
315
+ x = self.conv_pre(x)
316
+ for i in range(self.num_upsamples):
317
+ x = F.leaky_relu(x, LRELU_SLOPE)
318
+ x = self.ups[i](x)
319
+ xs = None
320
+ for j in range(self.num_kernels):
321
+ if xs is None:
322
+ xs = self.resblocks[i*self.num_kernels+j](x)
323
+ else:
324
+ xs += self.resblocks[i*self.num_kernels+j](x)
325
+ x = xs / self.num_kernels
326
+ x = F.leaky_relu(x)
327
+ x = self.conv_post(x)
328
+ x = torch.tanh(x)
329
+
330
+ return x
331
+
332
+ def remove_weight_norm(self):
333
+ for l in self.ups:
334
+ remove_weight_norm(l)
335
+ for l in self.resblocks:
336
+ l.remove_weight_norm()
337
+ remove_weight_norm(self.conv_pre)
338
+ remove_weight_norm(self.conv_post)
339
+
340
+ @classmethod
341
+ def from_pretrained(cls, pretrained_model_name_or_path, subfolder=None):
342
+ if subfolder is not None:
343
+ pretrained_model_name_or_path = os.path.join(pretrained_model_name_or_path, subfolder)
344
+ config_path = os.path.join(pretrained_model_name_or_path, "config.json")
345
+ ckpt_path = os.path.join(pretrained_model_name_or_path, "vocoder.pt")
346
+
347
+ config = get_config(config_path)
348
+ vocoder = cls(config)
349
+
350
+ state_dict_g = torch.load(ckpt_path)
351
+ vocoder.load_state_dict(state_dict_g["generator"])
352
+ vocoder.eval()
353
+ vocoder.remove_weight_norm()
354
+ return vocoder
355
+
356
+
357
+ @torch.no_grad()
358
+ def inference(self, mels, lengths=None):
359
+ self.eval()
360
+ with torch.no_grad():
361
+ wavs = self(mels).squeeze(1)
362
+
363
+ wavs = (wavs.cpu().numpy() * MAX_WAV_VALUE).astype("int16")
364
+
365
+ if lengths is not None:
366
+ wavs = wavs[:, :lengths]
367
+
368
+ return wavs
dataset.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved
2
+
3
+ import os.path as osp
4
+ import math
5
+ import pickle
6
+ import warnings
7
+
8
+ import glob
9
+
10
+ import torch.utils.data as data
11
+ import torch.nn.functional as F
12
+ from torchvision.datasets.video_utils import VideoClips
13
+ from converter import normalize, normalize_spectrogram, get_mel_spectrogram_from_audio
14
+ from torchaudio import transforms as Ta
15
+ from torchvision import transforms as Tv
16
+ from torchvision.io.video import read_video
17
+ import torch
18
+ from torchvision.transforms import InterpolationMode
19
+
20
+ class LatentDataset(data.Dataset):
21
+ """ Generic dataset for latents pregenerated from a dataset
22
+ Returns a dictionary of latents encoded from the original dataset """
23
+ exts = ['pt']
24
+
25
+ def __init__(self, data_folder, train=True):
26
+ """
27
+ Args:
28
+ data_folder: path to the folder with videos. The folder
29
+ should contain a 'train' and a 'test' directory,
30
+ each with corresponding videos stored
31
+ """
32
+ super().__init__()
33
+ self.train = train
34
+
35
+ folder = osp.join(data_folder, 'train' if train else 'test')
36
+ self.files = sum([glob.glob(osp.join(folder, '**', f'*.{ext}'), recursive=True)
37
+ for ext in self.exts], [])
38
+
39
+ warnings.filterwarnings('ignore')
40
+
41
+ def __len__(self):
42
+ return len(self.files)
43
+
44
+ def __getitem__(self, idx):
45
+ while True:
46
+ try:
47
+ latents = torch.load(self.files[idx], map_location="cpu")
48
+ except Exception as e:
49
+ print(f"Dataset Exception: {e}")
50
+ idx = (idx + 1) % len(self.files)
51
+ continue
52
+ break
53
+
54
+ return latents["video"], latents["audio"], latents["y"]
55
+ class AudioVideoDataset(data.Dataset):
56
+ """ Generic dataset for videos files stored in folders
57
+ Returns BCTHW videos in the range [-0.5, 0.5] """
58
+ exts = ['avi', 'mp4', 'webm']
59
+
60
+ def __init__(self, data_folder, train=True, resolution=64, sample_every_n_frames=1, sequence_length=8, audio_channels=1, sample_rate=16000, min_length=1, ignore_cache=False, labeled=True, target_video_fps=10):
61
+ """
62
+ Args:
63
+ data_folder: path to the folder with videos. The folder
64
+ should contain a 'train' and a 'test' directory,
65
+ each with corresponding videos stored
66
+ sequence_length: length of extracted video sequences
67
+ """
68
+ super().__init__()
69
+ self.train = train
70
+ self.sequence_length = sequence_length
71
+ self.resolution = resolution
72
+ self.sample_every_n_frames = sample_every_n_frames
73
+ self.audio_channels = audio_channels
74
+ self.sample_rate = sample_rate
75
+ self.min_length = min_length
76
+ self.labeled = labeled
77
+
78
+
79
+ folder = osp.join(data_folder, 'train' if train else 'test')
80
+ files = sum([glob.glob(osp.join(folder, '**', f'*.{ext}'), recursive=True)
81
+ for ext in self.exts], [])
82
+
83
+ # hacky way to compute # of classes (count # of unique parent directories)
84
+ self.classes = list(set([get_parent_dir(f) for f in files]))
85
+ self.classes.sort()
86
+ self.class_to_label = {c: i for i, c in enumerate(self.classes)}
87
+
88
+ warnings.filterwarnings('ignore')
89
+ cache_file = osp.join(folder, f"metadata_{self.sequence_length}.pkl")
90
+ if not osp.exists(cache_file) or ignore_cache or True:
91
+ clips = VideoClips(files, self.sequence_length, num_workers=32, frame_rate=target_video_fps)
92
+ # pickle.dump(clips.metadata, open(cache_file, 'wb'))
93
+ else:
94
+ metadata = pickle.load(open(cache_file, 'rb'))
95
+ clips = VideoClips(files, self.sequence_length,
96
+ _precomputed_metadata=metadata)
97
+
98
+ # self._clips = clips.subset(np.arange(24))
99
+ self._clips = clips
100
+
101
+ @property
102
+ def n_classes(self):
103
+ return len(self.classes)
104
+
105
+ def __len__(self):
106
+ return self._clips.num_clips()
107
+
108
+ def __getitem__(self, idx):
109
+ resolution = self.resolution
110
+ while True:
111
+ try:
112
+ video, _, info, _ = self._clips.get_clip(idx)
113
+ except Exception:
114
+ idx = (idx + 1) % self._clips.num_clips()
115
+ continue
116
+ break
117
+
118
+ return preprocess(video, resolution, sample_every_n_frames=self.sample_every_n_frames), self.get_audio(info, idx), self.get_label(idx)
119
+
120
+ def get_label(self, idx):
121
+ if not self.labeled:
122
+ return -1
123
+ video_idx, clip_idx = self._clips.get_clip_location(idx)
124
+ class_name = get_parent_dir(self._clips.video_paths[video_idx])
125
+ label = self.class_to_label[class_name]
126
+ return label
127
+
128
+ def get_audio(self, info, idx):
129
+ video_idx, clip_idx = self._clips.get_clip_location(idx)
130
+
131
+ video_path = self._clips.video_paths[video_idx]
132
+ video_fps = self._clips.video_fps[video_idx]
133
+
134
+ duration_per_frame = self._clips.video_pts[video_idx][1] - self._clips.video_pts[video_idx][0]
135
+ clip_pts = self._clips.clips[video_idx][clip_idx]
136
+ clip_pid = clip_pts // duration_per_frame
137
+
138
+ start_t = (clip_pid[0] / video_fps * 1. ).item()
139
+ end_t = ((clip_pid[-1] + 1) / video_fps * 1. ).item()
140
+
141
+ _, raw_audio, _ = read_video(video_path,start_t, end_t, pts_unit='sec')
142
+ raw_audio = prepare_audio(raw_audio, info["audio_fps"], self.sample_rate, self.audio_channels, self.sequence_length, self.min_length)
143
+
144
+ _, spec = get_mel_spectrogram_from_audio(raw_audio[0].numpy())
145
+ norm_spec = normalize_spectrogram(spec)
146
+ norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
147
+ norm_spec.unsqueeze(1) # add channel dimension
148
+ return norm_spec
149
+ #return raw_audio[0]
150
+
151
+
152
+ def get_parent_dir(path):
153
+ return osp.basename(osp.dirname(path))
154
+
155
+ def preprocess(video, resolution, sample_every_n_frames=1):
156
+ video = video.permute(0, 3, 1, 2).float() / 255. # TCHW
157
+
158
+ old_size = video.shape[2:4]
159
+ ratio = min(float(resolution)/(old_size[0]), float(resolution)/(old_size[1]) )
160
+ new_size = tuple([int(i*ratio) for i in old_size])
161
+ pad_w = resolution - new_size[1]
162
+ pad_h = resolution- new_size[0]
163
+ top,bottom = pad_h//2, pad_h-(pad_h//2)
164
+ left,right = pad_w//2, pad_w -(pad_w//2)
165
+ transform = Tv.Compose([Tv.Resize(new_size, interpolation=InterpolationMode.BICUBIC), Tv.Pad((left, top, right, bottom))])
166
+ video_new = transform(video)
167
+
168
+ video_new = video_new*2-1
169
+
170
+ return video_new
171
+
172
+ def pad_crop_audio(audio, target_length):
173
+ target_length = int(target_length)
174
+ n, s = audio.shape
175
+ start = 0
176
+ end = start + target_length
177
+ output = audio.new_zeros([n, target_length])
178
+ output[:, :min(s, target_length)] = audio[:, start:end]
179
+ return output
180
+
181
+ def prepare_audio(audio, in_sr, target_sr, target_channels, sequence_length, min_length):
182
+ if in_sr != target_sr:
183
+ resample_tf = Ta.Resample(in_sr, target_sr)
184
+ audio = resample_tf(audio)
185
+
186
+ max_length = target_sr/10*sequence_length
187
+ target_length = max_length + (min_length - (max_length % min_length)) % min_length
188
+
189
+ audio = pad_crop_audio(audio, target_length)
190
+
191
+ audio = set_audio_channels(audio, target_channels)
192
+
193
+ return audio
194
+
195
+ def set_audio_channels(audio, target_channels):
196
+ if target_channels == 1:
197
+ # Convert to mono
198
+ # audio = audio.mean(0, keepdim=True)
199
+ audio = audio[:1, :]
200
+ elif target_channels == 2:
201
+ # Convert to stereo
202
+ if audio.shape[0] == 1:
203
+ audio = audio.repeat(2, 1)
204
+ elif audio.shape[0] > 2:
205
+ audio = audio[:2, :]
206
+ return audio
download.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Functions for downloading pre-trained DiT models
9
+ """
10
+ from torchvision.datasets.utils import download_url
11
+ import torch
12
+ import os
13
+
14
+
15
+ pretrained_models = {'DiT-XL-2-512x512.pt', 'DiT-XL-2-256x256.pt'}
16
+
17
+
18
+ def find_model(model_name):
19
+ """
20
+ Finds a pre-trained DiT model, downloading it if necessary. Alternatively, loads a model from a local path.
21
+ """
22
+ if model_name in pretrained_models: # Find/download our pre-trained DiT checkpoints
23
+ return download_model(model_name)
24
+ else: # Load a custom DiT checkpoint:
25
+ assert os.path.isfile(model_name), f'Could not find DiT checkpoint at {model_name}'
26
+ checkpoint = torch.load(model_name, map_location=lambda storage, loc: storage)
27
+ if "ema" in checkpoint: # supports checkpoints from train.py
28
+ checkpoint = checkpoint["ema"]
29
+ return checkpoint
30
+
31
+
32
+ def download_model(model_name):
33
+ """
34
+ Downloads a pre-trained DiT model from the web.
35
+ """
36
+ assert model_name in pretrained_models
37
+ local_path = f'pretrained_models/{model_name}'
38
+ if not os.path.isfile(local_path):
39
+ os.makedirs('pretrained_models', exist_ok=True)
40
+ web_path = f'https://dl.fbaipublicfiles.com/DiT/models/{model_name}'
41
+ download_url(web_path, 'pretrained_models')
42
+ model = torch.load(local_path, map_location=lambda storage, loc: storage)
43
+ return model
44
+
45
+
46
+ if __name__ == "__main__":
47
+ # Download all DiT checkpoints
48
+ for model in pretrained_models:
49
+ download_model(model)
50
+ print('Done.')
models.py ADDED
@@ -0,0 +1,751 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+ # References:
8
+ # GLIDE: https://github.com/openai/glide-text2im
9
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
10
+ # --------------------------------------------------------
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import numpy as np
15
+ import math
16
+ from timm.models.vision_transformer import PatchEmbed, Attention, Mlp
17
+ import einops
18
+
19
+ import torch.utils.checkpoint as checkpoint
20
+
21
+ from transformers import PreTrainedModel
22
+ import random
23
+
24
+ class MelPatchEmbed(nn.Module):
25
+ """ Image to Patch Embedding
26
+ """
27
+ def __init__(self, n_mels, n_frames, patch_size=16, in_chans=1, embed_dim=768):
28
+ super().__init__()
29
+ num_patches = (n_mels // patch_size) * (n_frames // patch_size)
30
+ self.patch_size = patch_size
31
+ self.num_patches = int(num_patches)
32
+
33
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
34
+
35
+ def forward(self, x):
36
+ x = self.proj(x).flatten(2).transpose(1, 2)
37
+ return x
38
+
39
+ class SelfAttention(nn.Module):
40
+ def __init__(
41
+ self,
42
+ dim: int,
43
+ num_heads: int = 8,
44
+ qkv_bias: bool = False,
45
+ qk_norm: bool = False,
46
+ attn_drop: float = 0.,
47
+ proj_drop: float = 0.,
48
+ norm_layer: nn.Module = nn.LayerNorm,
49
+ is_causal: bool = False,
50
+ ) -> None:
51
+ super().__init__()
52
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
53
+ self.is_causal = is_causal
54
+ self.num_heads = num_heads
55
+ self.head_dim = dim // num_heads
56
+ self.scale = self.head_dim ** -0.5
57
+
58
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
59
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
60
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
61
+ self.attn_drop = nn.Dropout(attn_drop)
62
+ self.proj = nn.Linear(dim, dim)
63
+ self.proj_drop = nn.Dropout(proj_drop)
64
+
65
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
66
+ B, N, C = x.shape
67
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
68
+ q, k, v = qkv.unbind(0)
69
+ q, k = self.q_norm(q), self.k_norm(k)
70
+ x = torch.nn.functional.scaled_dot_product_attention(
71
+ q, k, v,
72
+ dropout_p=self.attn_drop.p if self.training else 0.,
73
+ is_causal=self.is_causal
74
+ )
75
+
76
+ x = x.transpose(1, 2).reshape(B, N, C)
77
+ x = self.proj(x)
78
+ x = self.proj_drop(x)
79
+ return x
80
+
81
+ class CrossAttention(nn.Module):
82
+ def __init__(
83
+ self,
84
+ dim,
85
+ num_heads=8,
86
+ qkv_bias=False,
87
+ attn_drop=0.,
88
+ proj_drop=0.,
89
+ mask_attn=False,
90
+ ):
91
+ super().__init__()
92
+ self.mask_attn = mask_attn
93
+ self.num_heads = num_heads
94
+ head_dim = dim // num_heads
95
+ # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
96
+ self.scale = head_dim ** -0.5
97
+
98
+ self.wq = nn.Linear(dim, dim, bias=qkv_bias)
99
+ self.wkv = nn.Linear(dim, dim*2, bias=qkv_bias)
100
+ self.attn_drop = nn.Dropout(attn_drop)
101
+ self.proj = nn.Linear(dim, dim)
102
+ self.proj_drop = nn.Dropout(proj_drop)
103
+
104
+ def forward(self, x, cond):
105
+ B, N, C = x.shape
106
+
107
+ q = self.wq(x)
108
+ q = einops.rearrange(q, 'B N (H D) -> B H N D', H=self.num_heads)
109
+
110
+ kv = self.wkv(cond) # BMD
111
+ kv = einops.rearrange(kv, 'B N (K H D) ->K B H N D', H=self.num_heads, K=2)
112
+ k = kv[0]
113
+ v = kv[1]
114
+
115
+
116
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
117
+
118
+ x = einops.rearrange(x, 'B H N D -> B N (H D)')
119
+ x = self.proj(x)
120
+ x = self.proj_drop(x)
121
+ return x
122
+
123
+ def modulate(x, shift, scale):
124
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
125
+
126
+ def temporalModulate(x, shift, scale):
127
+ """
128
+ Modulate the input tensor x with the given shift and scale tensors.
129
+ :param x: the input tensor to modulate with shape (B, T, L, D).
130
+ :param shift: the shift tensor with shape (B, T, D).
131
+ :param scale: the scale tensor with shape (B, T, D).
132
+ """
133
+ return x * (1 + scale.unsqueeze(2)) + shift.unsqueeze(2)
134
+
135
+
136
+ #################################################################################
137
+ # Embedding Layers for Timesteps and Class Labels #
138
+ #################################################################################
139
+
140
+ class TimestepEmbedder(nn.Module):
141
+ """
142
+ Embeds scalar timesteps into vector representations.
143
+ """
144
+ def __init__(self, hidden_size, frequency_embedding_size=256):
145
+ super().__init__()
146
+ self.mlp = nn.Sequential(
147
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
148
+ nn.SiLU(),
149
+ nn.Linear(hidden_size, hidden_size, bias=True),
150
+ )
151
+ self.frequency_embedding_size = frequency_embedding_size
152
+
153
+ @staticmethod
154
+ def timestep_embedding(t, dim, max_period=10000):
155
+ """
156
+ Create sinusoidal timestep embeddings.
157
+ :param t: a 1-D Tensor of N indices, one per batch element.
158
+ These may be fractional.
159
+ :param dim: the dimension of the output.
160
+ :param max_period: controls the minimum frequency of the embeddings.
161
+ :return: an (N, D) Tensor of positional embeddings.
162
+ """
163
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
164
+ half = dim // 2
165
+ freqs = torch.exp(
166
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
167
+ ).to(device=t.device)
168
+ args = t[:, None].float() * freqs[None]
169
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
170
+ if dim % 2:
171
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
172
+ return embedding
173
+
174
+ def forward(self, t):
175
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
176
+ t_emb = self.mlp(t_freq)
177
+ return t_emb
178
+
179
+
180
+ class AudioEmbedder(nn.Module):
181
+ """
182
+ Embeds scalar timesteps into vector representations.
183
+ """
184
+ def __init__(self, n_mels, hidden_size):
185
+ super().__init__()
186
+ self.mlp = nn.Sequential(
187
+ nn.Linear(n_mels, hidden_size, bias=True),
188
+ nn.SiLU(),
189
+ nn.Linear(hidden_size, hidden_size, bias=True),
190
+ )
191
+
192
+ # TODO: Activation?
193
+
194
+ def forward(self, a):
195
+ a = self.mlp(a)
196
+ return a
197
+
198
+ def init_weights(self):
199
+ nn.init.xavier_uniform_(self.mlp[0].weight)
200
+ nn.init.constant_(self.mlp[0].bias, 0)
201
+ nn.init.xavier_uniform_(self.mlp[2].weight)
202
+ nn.init.constant_(self.mlp[2].bias, 0)
203
+
204
+ class LabelEmbedder(nn.Module):
205
+ """
206
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
207
+ """
208
+ def __init__(self, num_classes, hidden_size, dropout_prob):
209
+ super().__init__()
210
+ use_cfg_embedding = dropout_prob > 0
211
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
212
+ self.num_classes = num_classes
213
+ self.dropout_prob = dropout_prob
214
+
215
+ def token_drop(self, labels, force_drop_ids=None):
216
+ """
217
+ Drops labels to enable classifier-free guidance.
218
+ """
219
+ if force_drop_ids is None:
220
+ drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
221
+ else:
222
+ drop_ids = force_drop_ids == 1
223
+ labels = torch.where(drop_ids, self.num_classes, labels)
224
+ return labels
225
+
226
+ def forward(self, labels, train, force_drop_ids=None):
227
+ use_dropout = self.dropout_prob > 0
228
+ if (train and use_dropout) or (force_drop_ids is not None):
229
+ labels = self.token_drop(labels, force_drop_ids)
230
+ embeddings = self.embedding_table(labels)
231
+ return embeddings
232
+
233
+
234
+ #################################################################################
235
+ # Core FLAV Model #
236
+ #################################################################################
237
+
238
+ class FLAVBlock(nn.Module):
239
+ """
240
+ A FLAV block with adaptive layer norm zero (adaLN-Zero) conditioning.
241
+ """
242
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, grad_ckpt=False, causal_attn=False, **block_kwargs):
243
+ super().__init__()
244
+
245
+ self.video_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
246
+ self.audio_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
247
+ # self.video_audio_attn = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
248
+ self.video_spatial_attn = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
249
+ self.video_temporal_attn = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, is_causal=causal_attn, **block_kwargs)
250
+
251
+ self.audio_spatial_attn = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, is_causal=causal_attn, **block_kwargs)
252
+ # self.audio_temporal_attn = SelfAttention(hidden_size, num_heads=num_heads, qkv_bias=True, is_causal=causal_attn, **block_kwargs)
253
+
254
+ self.video_norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
255
+ self.audio_norm3 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
256
+
257
+ self.video_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
258
+ self.audio_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
259
+
260
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
261
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
262
+ self.video_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
263
+ self.audio_mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
264
+
265
+ self.video_adaLN_modulation = nn.Sequential(
266
+ nn.SiLU(),
267
+ nn.Linear(hidden_size, 6 * hidden_size, bias=True)
268
+ )
269
+
270
+ self.audio_adaLN_modulation = nn.Sequential(
271
+ nn.SiLU(),
272
+ nn.Linear(hidden_size, 3 * hidden_size, bias=True)
273
+ )
274
+
275
+ self.video_scale = nn.Sequential(
276
+ nn.SiLU(),
277
+ nn.Linear(hidden_size, 3 * hidden_size, bias=True)
278
+ )
279
+
280
+ self.audio_scale = nn.Sequential(
281
+ nn.SiLU(),
282
+ nn.Linear(hidden_size, 3 * hidden_size, bias=True)
283
+ )
284
+
285
+ self.v_avg_proj = nn.Sequential(
286
+ nn.Linear(hidden_size, hidden_size, bias=True),
287
+ )
288
+ self.a_avg_proj = nn.Sequential(
289
+ nn.Linear(hidden_size, hidden_size, bias=True),
290
+ )
291
+
292
+
293
+
294
+ self.grad_ckpt = grad_ckpt
295
+
296
+ def forward(self,v, a, v_c, a_c):
297
+ if self.grad_ckpt:
298
+ return checkpoint.checkpoint(self._forward, v, a, v_c, a_c, use_reentrant=False)
299
+ else:
300
+ return self._forward(v, a, v_c, a_c)
301
+
302
+ def _forward(self, v, a, v_c, a_c):
303
+ """
304
+ v: Size of (B, T, Lv, D)
305
+ a: Size of (B, T, La, D)
306
+ v_c: Size of (B, T, D)
307
+ a_c: Size of (B, T, D)
308
+ """
309
+
310
+ video_shift_msa, video_scale_msa, video_gate_msa, video_shift_tmsa, video_scale_tmsa, video_gate_tmsa = self.video_adaLN_modulation(v_c).chunk(6, dim=-1)
311
+ # audio_shift_msa, audio_scale_msa, audio_gate_msa, audio_shift_tmsa, audio_scale_tmsa, audio_gate_tmsa = self.audio_adaLN_modulation(a_c).chunk(6, dim=-1)
312
+ audio_shift_msa, audio_scale_msa, audio_gate_msa = self.audio_adaLN_modulation(a_c).chunk(3, dim=-1)
313
+ B, T, L, D = v.shape
314
+
315
+ v_att = temporalModulate(self.video_norm1(v), video_shift_msa, video_scale_msa)
316
+ v_att = einops.rearrange(v_att, 'B T L D -> (B T) L D')
317
+ v_att = v + video_gate_msa.unsqueeze(2)*(self.video_spatial_attn(v_att).view(B, T, L, D))
318
+
319
+ v = v_att
320
+
321
+ v_att = temporalModulate(self.video_norm2(v_att), video_shift_tmsa, video_scale_tmsa)
322
+ v_att = einops.rearrange(v_att, 'B T L D -> (B L) T D', T=T)
323
+ v_att = einops.rearrange(self.video_temporal_attn(v_att), "(B L) T D -> B T L D", B=B)
324
+ v = v + video_gate_tmsa.unsqueeze(2)*v_att
325
+
326
+ a_att = temporalModulate(self.audio_norm1(a), audio_shift_msa, audio_scale_msa)
327
+ a_att = einops.rearrange(a_att, 'B T L D -> B (T L) D')
328
+ a_att = a + audio_gate_msa.unsqueeze(2)*(self.audio_spatial_attn(a_att).view(B, T, -1, D))
329
+
330
+ a = a_att
331
+
332
+ a_avg = self.a_avg_proj(a.mean(dim=2)) # B T D
333
+ v_avg = self.v_avg_proj(v.mean(dim=2)) # B T D
334
+
335
+ v_avg += a_c
336
+ a_avg += v_c
337
+
338
+ scale_v, shift_v, gate_v = self.video_scale(a_avg).chunk(3, dim=-1)
339
+ scale_a, shift_a, gate_a = self.audio_scale(v_avg).chunk(3, dim=-1)
340
+
341
+
342
+ v = v + gate_v.unsqueeze(2) * self.video_mlp(temporalModulate(self.video_norm3(v), shift_v, scale_v))
343
+ a = a + gate_a.unsqueeze(2) * self.audio_mlp(temporalModulate(self.audio_norm3(a), shift_a, scale_a))
344
+
345
+ return v, a
346
+
347
+ def _spatial_attn(self, x, b_size, attn_func):
348
+ x = einops.rearrange(x, "(B N) T D -> (B T) N D", B=b_size)
349
+ x = attn_func(x)
350
+ x = einops.rearrange(x, "(B T) N D -> (B N) T D", B=b_size)
351
+ return x
352
+
353
+
354
+ class FinalLayer(nn.Module):
355
+ """
356
+ The final layer of FLAV.
357
+ """
358
+ def __init__(self, hidden_size, patch_size, out_channels):
359
+ super().__init__()
360
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
361
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
362
+ self.adaLN_modulation = nn.Sequential(
363
+ nn.SiLU(),
364
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
365
+ )
366
+
367
+ def forward(self, x, c):
368
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
369
+ x = temporalModulate(self.norm_final(x), shift, scale)
370
+ x = self.linear(x)
371
+ return x
372
+
373
+
374
+ class FLAV(nn.Module):
375
+ """
376
+ Diffusion model with a Transformer backbone.
377
+ """
378
+ def __init__(
379
+ self,
380
+ latent_size=None,
381
+ patch_size=2,
382
+ in_channels=4,
383
+ hidden_size=1152,
384
+ depth=28,
385
+ num_heads=16,
386
+ mlp_ratio=4.0,
387
+ class_dropout_prob=0.1,
388
+ num_classes=1000,
389
+ predict_frames = 1,
390
+ grad_ckpt = False,
391
+ n_mels=256,
392
+ audio_fr = 16000,
393
+ causal_attn = False,
394
+ ):
395
+ super().__init__()
396
+ self.in_channels = in_channels
397
+ self.out_channels = in_channels
398
+ self.patch_size = patch_size
399
+ self.num_heads = num_heads
400
+ self.predict_frames = predict_frames
401
+ self.grad_ckpt = grad_ckpt
402
+ self.n_mels = n_mels
403
+ self.audio_fr = audio_fr
404
+ self.latent_size = latent_size # T H W
405
+
406
+ self.num_classes = num_classes
407
+
408
+ self.v_embedder = PatchEmbed(latent_size, patch_size, in_channels, hidden_size, bias=True)
409
+ self.a_embedder = nn.Linear(n_mels, hidden_size, bias=True)
410
+
411
+ self.video_t_embedder = TimestepEmbedder(hidden_size)
412
+ self.audio_t_embedder = TimestepEmbedder(hidden_size)
413
+
414
+ if self.num_classes > 0:
415
+ self.video_y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
416
+ self.audio_y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
417
+
418
+ num_patches = self.v_embedder.num_patches
419
+ self.video_spatial_pos_embed = nn.Parameter(torch.zeros(1, 1, num_patches, hidden_size), requires_grad=True)
420
+ self.video_temporal_pos_embed = nn.Parameter(torch.zeros(1, self.predict_frames, 1, hidden_size), requires_grad=True)
421
+
422
+ self.audio_spatial_pos_embed = nn.Parameter(torch.zeros(1, 1, 10, hidden_size), requires_grad=True)
423
+ self.audio_temporal_pos_embed = nn.Parameter(torch.zeros(1, self.predict_frames, 1, hidden_size), requires_grad=True)
424
+
425
+
426
+ self.blocks = nn.ModuleList([
427
+ FLAVBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, grad_ckpt=grad_ckpt, causal_attn=causal_attn) for _ in range(depth)
428
+ ])
429
+
430
+ self.video_final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
431
+ self.audio_final_layer = FinalLayer(hidden_size, 1, n_mels)
432
+ self.initialize_weights()
433
+
434
+ def initialize_weights(self):
435
+ # Initialize transformer layers:
436
+ def _basic_init(module):
437
+ if isinstance(module, nn.Linear):
438
+ torch.nn.init.xavier_uniform_(module.weight)
439
+ if module.bias is not None:
440
+ nn.init.constant_(module.bias, 0)
441
+ self.apply(_basic_init)
442
+
443
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
444
+ w = self.v_embedder.proj.weight.data
445
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
446
+ nn.init.constant_(self.v_embedder.proj.bias, 0)
447
+
448
+
449
+ if self.num_classes > 0:
450
+ nn.init.normal_(self.video_y_embedder.embedding_table.weight, std=0.02)
451
+ nn.init.normal_(self.audio_y_embedder.embedding_table.weight, std=0.02)
452
+
453
+ # Initialize timestep embedding MLP:
454
+ nn.init.normal_(self.video_t_embedder.mlp[0].weight, std=0.02)
455
+ nn.init.normal_(self.video_t_embedder.mlp[2].weight, std=0.02)
456
+
457
+ nn.init.normal_(self.audio_t_embedder.mlp[0].weight, std=0.02)
458
+ nn.init.normal_(self.audio_t_embedder.mlp[2].weight, std=0.02)
459
+
460
+ # Zero-out adaLN modulation layers in FLAV blocks:
461
+ for block in self.blocks:
462
+ nn.init.constant_(block.video_adaLN_modulation[-1].weight, 0)
463
+ nn.init.constant_(block.video_adaLN_modulation[-1].bias, 0)
464
+ nn.init.constant_(block.audio_adaLN_modulation[-1].weight, 0)
465
+ nn.init.constant_(block.audio_adaLN_modulation[-1].bias, 0)
466
+
467
+ nn.init.constant_(block.video_scale[-1].weight, 0)
468
+ nn.init.constant_(block.video_scale[-1].bias, 0)
469
+
470
+ nn.init.constant_(block.audio_scale[-1].weight, 0)
471
+ nn.init.constant_(block.audio_scale[-1].bias, 0)
472
+
473
+ # Zero-out output layers:
474
+ nn.init.constant_(self.video_final_layer.adaLN_modulation[-1].weight, 0)
475
+ nn.init.constant_(self.video_final_layer.adaLN_modulation[-1].bias, 0)
476
+ nn.init.constant_(self.video_final_layer.linear.weight, 0)
477
+ nn.init.constant_(self.video_final_layer.linear.bias, 0)
478
+
479
+ nn.init.constant_(self.audio_final_layer.adaLN_modulation[-1].weight, 0)
480
+ nn.init.constant_(self.audio_final_layer.adaLN_modulation[-1].bias, 0)
481
+ nn.init.constant_(self.audio_final_layer.linear.weight, 0)
482
+ nn.init.constant_(self.audio_final_layer.linear.bias, 0)
483
+
484
+ def unpatchify(self, x):
485
+ """
486
+ x: (N, T, patch_size**2 * C)
487
+ imgs: (N, C, H, W)
488
+ """
489
+ c = self.out_channels
490
+ p = self.v_embedder.patch_size[0]
491
+ h = w = int(x.shape[1] ** 0.5)
492
+ assert h * w == x.shape[1]
493
+
494
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
495
+ x = torch.einsum('nhwpqc->nchpwq', x)
496
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
497
+ return imgs
498
+
499
+ def _apply_rnd_mask(self, input, mask, device="cuda"):
500
+ input_rnd = torch.rand(input[0].shape).unsqueeze(0).to(device=device)*2 - 1
501
+ return self._apply_mask(input, mask, input_rnd)
502
+
503
+ def _apply_zero_mask(self, input, mask, device="cuda"):
504
+ input_zero= torch.zeros(input[0].shape).unsqueeze(0).to(device=device)
505
+ return self._apply_mask(input, mask, input_zero)
506
+
507
+ def _get_frames_mask(self, bs):
508
+ """
509
+ bs: batch size
510
+
511
+ returns a boolean mask to be applied to condition frames
512
+ to mask a selected number of random frames
513
+ """
514
+ fmask = np.full(self.cond_frames*bs, False)
515
+ frames = list(range(self.cond_frames))
516
+ for b in range(bs):
517
+ if random.randint(0, 100) < self.mask_freq:
518
+ sub_frames = random.sample(frames, min(self.cond_frames, self.frames_to_mask))
519
+ idxs = [f+(b*self.cond_frames) for f in sub_frames]
520
+ fmask[idxs] = True
521
+ return fmask
522
+
523
+ def _get_batch_mask(self, bs):
524
+ """
525
+ bs: batch size
526
+
527
+ returns a boolean mask to be applied to condition frames
528
+ to mask a random number of condition sequences in a batch
529
+ """
530
+ rnd = np.random.rand(bs)
531
+ bmask= rnd < self.batch_mask_freq/100
532
+ bmask = np.repeat(bmask, self.cond_frames)
533
+ return bmask
534
+
535
+ def _apply_mask(self, input, mask, values):
536
+ input[mask] = values
537
+ return input
538
+
539
+ def audio_unpatchify(self, x):
540
+ """
541
+ x: (N, T, patch_size * C)
542
+ audio: (N, N_mels, frames)
543
+ """
544
+ c = 1
545
+ p = self.audio_patch_size
546
+ h = int(self.n_mels//p)
547
+ w = int((self.audio_fr/1600)/p)
548
+
549
+
550
+ assert h * w == x.shape[1]
551
+
552
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
553
+ x = torch.einsum('nhwpqc->nchpwq', x)
554
+ audio = x.reshape(shape=(x.shape[0], c, h * p, w * p))
555
+ return audio
556
+
557
+ def forward(self, v, a, t, y):
558
+ """
559
+ Forward pass of FLAV.
560
+ v: (B, T, C, H, W) tensor of spatial inputs (images or latent representations of images)
561
+ a: (B, 1, n_bins, T) # mel spectrogram of audio
562
+ t: (B, T) tensor of diffusion timesteps
563
+ y: (B,) tensor of class labels
564
+ """
565
+
566
+ ### Video
567
+ B, T, C, H, W = v.shape
568
+ v = einops.rearrange(v, 'B T C H W -> (B T) C H W')
569
+ v = self.v_embedder(v)
570
+ v = einops.rearrange(v, '(B T) L D -> B T L D', T=T)
571
+ v = v + self.video_temporal_pos_embed + self.video_spatial_pos_embed
572
+
573
+
574
+ ### Audio
575
+ a = einops.rearrange(a, "B T C N F -> B T C F N").squeeze(2)
576
+ a = self.a_embedder(a)
577
+ a = a + self.audio_temporal_pos_embed + self.audio_spatial_pos_embed
578
+
579
+ ### Conditioning
580
+ t = t.view(-1) # B T -> (B T)
581
+ v_t = self.video_t_embedder(t) # (B, T, D)
582
+ v_t = v_t.view(B, T, -1) # (B T) D -> B T D
583
+
584
+ if self.num_classes > 0:
585
+ v_y = self.video_y_embedder(y, self.training) # (B, D)
586
+ v_y = v_y.unsqueeze(1).expand(-1, T, -1) # (B, T, D)
587
+
588
+ v_c = (v_t + v_y) if self.num_classes > 0 else v_t # (B, T, D)
589
+
590
+ a_t = self.audio_t_embedder(t) # (B, T, D)
591
+ a_t = a_t.view(B, T, -1)
592
+
593
+ if self.num_classes > 0:
594
+ a_y = self.audio_y_embedder(y, self.training)
595
+ a_y = a_y.unsqueeze(1).expand(-1, T, -1)
596
+
597
+ a_c = (a_t + a_y) if self.num_classes > 0 else a_t # (B, T, D)
598
+
599
+ for block in self.blocks:
600
+ v, a = block(v, a, v_c, a_c) # (B, T, D)
601
+
602
+ v = self.video_final_layer(v, v_c) # (B, T, patch_size ** 2 * out_channels), (B, T, L)
603
+ a = self.audio_final_layer(a, a_c)
604
+
605
+ v = einops.rearrange(v, 'B T L D -> (B T) L D', T = T)
606
+ v = self.unpatchify(v) # (B, out_channels, H, W)
607
+ v = einops.rearrange(v, '(B T) C H W -> B T C H W', T = T)
608
+
609
+ a = einops.rearrange(a, 'B T F N -> B T N F', T = T).unsqueeze(2)
610
+ return v, a
611
+
612
+ def forward_with_cfg(self, v, a, t, y, cfg_scale):
613
+ """
614
+ Forward pass of FLAV, but also batches the unconditional forward pass for classifier-free guidance.
615
+ """
616
+ v_combined = torch.cat([v, v], dim=0)
617
+
618
+ a_combined = torch.cat([a, a], dim=0)
619
+
620
+ y_null = torch.tensor([self.num_classes]*v.shape[0], device=v.device)
621
+ y = torch.cat([y, y_null], dim=0)
622
+
623
+ t = torch.cat([t, t], dim=0)
624
+
625
+ v_model_out, a_model_out = self.forward(v_combined, a_combined, t, y)
626
+ v_eps = v_model_out
627
+ a_eps = a_model_out
628
+
629
+ v_cond_eps, v_uncond_eps = torch.split(v_eps, len(v_eps) // 2, dim=0)
630
+ v_eps = v_uncond_eps + cfg_scale * (v_cond_eps - v_uncond_eps)
631
+
632
+ a_cond_eps, a_uncond_eps = torch.split(a_eps, len(a_eps) // 2, dim=0)
633
+ a_eps = a_uncond_eps + cfg_scale * (a_cond_eps - a_uncond_eps)
634
+
635
+ return v_eps, a_eps
636
+
637
+ #################################################################################
638
+ # Sine/Cosine Positional Embedding Functions #
639
+ #################################################################################
640
+ # https://github.com/facebookresearch/mae/blob/main/util/video_pos_embed.py
641
+
642
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
643
+ """
644
+ grid_size: int of the grid height and width
645
+ return:
646
+ video_pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
647
+ """
648
+ grid_h = np.arange(grid_size, dtype=np.float32)
649
+ grid_w = np.arange(grid_size, dtype=np.float32)
650
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
651
+ grid = np.stack(grid, axis=0)
652
+
653
+ grid = grid.reshape([2, 1, grid_size, grid_size])
654
+ video_pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
655
+ if cls_token and extra_tokens > 0:
656
+ video_pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), video_pos_embed], axis=0)
657
+ return video_pos_embed
658
+
659
+
660
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
661
+ assert embed_dim % 2 == 0
662
+
663
+ # use half of dimensions to encode grid_h
664
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
665
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
666
+
667
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
668
+ return emb
669
+
670
+
671
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
672
+ """
673
+ embed_dim: output dimension for each position
674
+ pos: a list of positions to be encoded: size (M,)
675
+ out: (M, D)
676
+ """
677
+ assert embed_dim % 2 == 0
678
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
679
+ omega /= embed_dim / 2.
680
+ omega = 1. / 10000**omega # (D/2,)
681
+
682
+ pos = pos.reshape(-1) # (M,)
683
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
684
+
685
+ emb_sin = np.sin(out) # (M, D/2)
686
+ emb_cos = np.cos(out) # (M, D/2)
687
+
688
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
689
+ return emb
690
+
691
+
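A quick sanity-check sketch for the sine/cosine helpers above; the grid size, embedding dimension, and frame count below are arbitrary illustration values, not taken from any FLAV config:

    import numpy as np

    # 2-D spatial table: one row per grid cell -> (grid_size * grid_size, embed_dim)
    pos_embed = get_2d_sincos_pos_embed(embed_dim=64, grid_size=8)
    assert pos_embed.shape == (8 * 8, 64)

    # 1-D temporal table over frame indices -> (num_frames, embed_dim)
    temporal_embed = get_1d_sincos_pos_embed_from_grid(64, np.arange(10))
    assert temporal_embed.shape == (10, 64)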
692
+ #################################################################################
693
+ # FLAV Configs #
694
+ #################################################################################
695
+
696
+ def FLAV_XL_2(**kwargs):
697
+ return FLAV(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
698
+
699
+ def FLAV_XL_4(**kwargs):
700
+ return FLAV(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)
701
+
702
+ def FLAV_XL_8(**kwargs):
703
+ return FLAV(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)
704
+
705
+ # def FLAV_L_2(**kwargs):
706
+ # return FLAV(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
707
+
708
+ def FLAV_L_1(**kwargs):
709
+ return FLAV(depth=24, hidden_size=1024, patch_size=1, num_heads=16, **kwargs)
710
+
711
+ def FLAV_L_2(**kwargs):
712
+ return FLAV(depth=20, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
713
+
714
+ def FLAV_L_4(**kwargs):
715
+ return FLAV(depth=20, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)
716
+
717
+ def FLAV_L_8(**kwargs):
718
+ return FLAV(depth=20, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)
719
+
720
+ # def FLAV_B_2(**kwargs):
721
+ # return FLAV(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
722
+
723
+ def FLAV_B_1(**kwargs):
724
+ return FLAV(depth=12, hidden_size=768, patch_size=1, num_heads=12, **kwargs)
725
+
726
+ def FLAV_B_2(**kwargs):
727
+ return FLAV(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
728
+
729
+ def FLAV_B_4(**kwargs):
730
+ return FLAV(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)
731
+
732
+ def FLAV_B_8(**kwargs):
733
+ return FLAV(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)
734
+
735
+ def FLAV_S_2(**kwargs):
736
+ return FLAV(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
737
+
738
+ def FLAV_S_4(**kwargs):
739
+ return FLAV(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)
740
+
741
+ def FLAV_S_8(**kwargs):
742
+ return FLAV(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)
743
+
744
+
745
+ FLAV_models = {
746
+ 'FLAV-XL/2': FLAV_XL_2, 'FLAV-XL/4': FLAV_XL_4, 'FLAV-XL/8': FLAV_XL_8,
747
+ 'FLAV-L/1' : FLAV_L_1, 'FLAV-L/2': FLAV_L_2, 'FLAV-L/4': FLAV_L_4, 'FLAV-L/8': FLAV_L_8,
748
+ 'FLAV-B/1' : FLAV_B_1, 'FLAV-B/2': FLAV_B_2, 'FLAV-B/4': FLAV_B_4, 'FLAV-B/8': FLAV_B_8,
749
+ 'FLAV-S/2' : FLAV_S_2, 'FLAV-S/4': FLAV_S_4, 'FLAV-S/8': FLAV_S_8,
750
+ }
751
+
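A minimal usage sketch for the config registry above: models are looked up by a "FLAV-<size>/<patch size>" key, and any extra keyword arguments are forwarded to the FLAV constructor via **kwargs. The keyword arguments shown here (latent_size, in_channels, num_classes, predict_frames, causal_attn) are assumptions about that constructor, included only for illustration:

    from models import FLAV_models

    # Base config, patch size 1; all kwargs below are assumed constructor options.
    model = FLAV_models["FLAV-B/1"](
        latent_size=32,       # assumed: spatial size of the VAE latent grid
        in_channels=4,        # assumed: number of VAE latent channels
        num_classes=0,        # assumed: 0 disables class conditioning
        predict_frames=10,    # assumed: frames generated per rolling window
        causal_attn=True,     # assumed: causal temporal attention
    )
    model.eval()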
utils.py ADDED
@@ -0,0 +1,267 @@
1
+ # from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
2
+ # # from moviepy.audio.AudioClip import AudioArrayClip
3
+ # from moviepy.audio.io.AudioFileClip import AudioFileClip
4
+ from torch.utils.data import DataLoader
5
+ from dataset import AudioVideoDataset, LatentDataset
6
+ import torch as th
7
+ import numpy as np
8
+
9
+ import einops
10
+ from moviepy.audio.io.AudioFileClip import AudioFileClip
11
+ from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
12
+ from diffusers.models import AutoencoderKL
13
+
14
+ from converter import denormalize, denormalize_spectrogram
15
+
16
+ import soundfile as sf
17
+ import os
18
+ import json
19
+ import torch
20
+ from tqdm import tqdm
21
+ #################################################################################
22
+ # Video Utils #
23
+ #################################################################################
24
+
25
+
26
+ def preprocess_video(video):
27
+ # video = 255*(video+1)/2.0 # [-1,1] -> [0,1] -> [0,255]
28
+ # video = th.clamp(video, 0, 255).to(dtype=th.uint8, device="cuda")
29
+ video = out2img(video)
30
+ video = einops.rearrange(video, 't c h w -> t h w c').cpu().numpy()
31
+ return video
32
+
33
+ def preprocess_video_batch(videos):
34
+ B = videos.shape[0]
35
+ videos_prep = np.empty(B, dtype=np.ndarray)
36
+ for b in range(B):
37
+ videos_prep[b] = preprocess_video(videos[b])
38
+ videos_prep = np.stack(videos_prep, axis=0)
39
+ return videos_prep
40
+
41
+ def save_latents(video, audio, y, output_path, name_prefix, ext=".pt"):
42
+ os.makedirs(output_path, exist_ok=True)
43
+ th.save(
44
+ {
45
+ "video":video,
46
+ "audio":audio,
47
+ "y":y
48
+ }, os.path.join(output_path, name_prefix + ext))
49
+
50
+ def save_multimodal(video, audio, output_path, name_prefix, video_fps=10, audio_fps=16000, audio_dir=None):
51
+ if not audio_dir:
52
+ audio_dir = output_path
53
+
54
+ #prepare folders
55
+ audio_dir = os.path.join(audio_dir, "audio")
56
+ os.makedirs(audio_dir, exist_ok=True)
57
+ audio_path = os.path.join(audio_dir, name_prefix + "_audio.wav")
58
+
59
+ video_dir = os.path.join(output_path, "video")
60
+ os.makedirs(video_dir, exist_ok=True)
61
+ video_path = os.path.join(video_dir, name_prefix + "_video.mp4")
62
+
63
+ #save audio
64
+ sf.write(audio_path, audio, samplerate=audio_fps)
65
+
66
+ #save video
67
+ video = preprocess_video(video)
68
+
69
+ imgs = [img for img in video]
70
+ video_clip = ImageSequenceClip(imgs, fps=video_fps)
71
+ audio_clip = AudioFileClip(audio_path)
72
+ video_clip = video_clip.with_audio(audio_clip)
73
+ video_clip.write_videofile(video_path, video_fps, audio=True, audio_fps=audio_fps)
74
+
75
+ def get_dataloader(args, logger, sequence_length, train, latents=False):
76
+ if latents:
77
+ train_set = LatentDataset(args.data_path, train=train)
78
+ else:
79
+ train_set = AudioVideoDataset(
80
+ args.data_path,
81
+ train=train,
82
+ sample_every_n_frames=1,
83
+ resolution=args.image_size,
84
+ sequence_length = sequence_length,
85
+ audio_channels = 1,
86
+ sample_rate=16000,
87
+ min_length=1,
88
+ ignore_cache=args.ignore_cache,
89
+ labeled=args.num_classes > 0,
90
+ target_video_fps=args.target_video_fps,
91
+ )
92
+ loader = DataLoader(
93
+ train_set,
94
+ batch_size=args.batch_size,
95
+ shuffle=True,
96
+ num_workers=args.num_workers,
97
+ pin_memory=True,
98
+ drop_last=True
99
+ )
100
+ if logger is not None:
101
+ logger.info(f'{"Train" if train else "Test"} Dataset contains {len(train_set)} images ({args.data_path})')
102
+ else:
103
+ print(f'{"Train" if train else "Test"} Dataset contains {len(train_set)} images ({args.data_path})')
104
+ return loader
105
+
106
+ @torch.no_grad()
107
+ def encode_video(video, vae, use_sd_vae = False):
108
+ b, t, c, h, w = video.shape
109
+ video = einops.rearrange(video, "b t c h w-> (b t) c h w")
110
+ if use_sd_vae:
111
+ video = vae.encode(video).latent_dist.sample().mul_(0.18215)
112
+ else:
113
+ video = vae.encode(video)*vae.cfg.scaling_factor
114
+ video = einops.rearrange(video, "(b t) c h w -> b t c h w", t=t)
115
+ return video
116
+
117
+ @torch.no_grad()
118
+ def decode_video(video, vae):
119
+ b = video.shape[0]
120
+ video_decoded = []
121
+ video = einops.rearrange(video, "b t c h w -> (b t) c h w")
122
+
123
+ #use minibatch to avoid memory error
124
+ for i in range(0, video.shape[0], b):
125
+ if isinstance(vae, AutoencoderKL):
126
+ video_decoded.append(vae.decode(video[i:i+b] / 0.18215).sample.detach().cpu())
127
+ else:
128
+ video_decoded.append(vae.decode(video[i:i+b] / vae.cfg.scaling_factor).detach().cpu())
129
+
130
+ video = torch.cat(video_decoded, dim=0)
131
+ video = einops.rearrange(video, "(b t) c h w ->b t c h w",b=b)
132
+ return video
133
+
134
+
135
+ def generate_sample(vae,
136
+ rectified_flow,
137
+ forward_fn,
138
+ video_length,
139
+ video_latent_size,
140
+ audio_latent_size,
141
+ y,
142
+ cfg_scale,
143
+ device):
144
+
145
+
146
+ with torch.no_grad():
147
+ v_z = torch.randn(video_latent_size, device=device)*rectified_flow.noise_scale
148
+ a_z = torch.randn(audio_latent_size, device=device)*rectified_flow.noise_scale
149
+
150
+ model_kwargs = dict(y=y, cfg_scale=cfg_scale) if cfg_scale else dict(y=y)
151
+
152
+ sample_fn = rectified_flow.sample(
153
+ forward_fn, v_z, a_z, model_kwargs=model_kwargs, progress=True)()
154
+
155
+ video = []
156
+ audio = []
157
+ for _ in tqdm(range(video_length), desc="Generating frames"):
158
+ video_samples, audio_samples = next(sample_fn)
159
+
160
+ video.append(video_samples)
161
+ audio.append(audio_samples)
162
+
163
+ video = torch.stack(video, dim=1)
164
+ audio = torch.stack(audio, dim=1)
165
+
166
+ video = decode_video(video, vae)
167
+ audio = einops.rearrange(audio, "B T C N F -> B C N (T F)")
168
+
169
+ return video, audio
170
+
171
+ def generate_sample_a2v(vae,
172
+ rectified_flow,
173
+ forward_fn,
174
+ video_length,
175
+ video_latent_size,
176
+ audio,
177
+ y,
178
+ device,
179
+ cfg_scale=1,
180
+ scale=1):
181
+
182
+
183
+ v_z = torch.randn(video_latent_size, device=device)*rectified_flow.noise_scale
184
+
185
+ model_kwargs = dict(y=y, cfg_scale=cfg_scale) if cfg_scale else dict(y=y)
186
+
187
+ sample_fn = rectified_flow.sample_a2v(
188
+ forward_fn, v_z, audio, model_kwargs=model_kwargs, scale=scale, progress=True)()
189
+
190
+ video = []
191
+ for i in tqdm(range(video_length), desc="Generating frames"):
192
+ video_samples = next(sample_fn)
193
+
194
+ video.append(video_samples)
195
+
196
+ video = torch.stack(video, dim=1)
197
+
198
+ video = decode_video(video, vae)
199
+ audio = einops.rearrange(audio, "B T C N F -> B C N (T F)")
200
+
201
+ return video, audio
202
+
203
+ def generate_sample_v2a(vae,
204
+ rectified_flow,
205
+ forward_fn,
206
+ video_length,
207
+ video,
208
+ audio_latent_size,
209
+ y,
210
+ device,
211
+ cfg_scale=1,
212
+ scale=1):
213
+
214
+
215
+ a_z = torch.randn(audio_latent_size, device=device)*rectified_flow.noise_scale
216
+
217
+ model_kwargs = dict(y=y, cfg_scale=cfg_scale) if cfg_scale else dict(y=y)
218
+
219
+ sample_fn = rectified_flow.sample_v2a(
220
+ forward_fn, video, a_z, model_kwargs=model_kwargs, scale=scale, progress=True)()
221
+
222
+ audio = []
223
+ for i in tqdm(range(video_length), desc="Generating frames"):
224
+ audio_samples = next(sample_fn)
225
+
226
+ audio.append(audio_samples)
227
+
228
+ audio = torch.stack(audio, dim=1)
229
+
230
+ video = decode_video(video, vae)
231
+ audio = einops.rearrange(audio, "B T C N F -> B C N (T F)")
232
+
233
+ return video, audio
234
+
235
+ def dict_to_json(path, args):
236
+ with open(path, 'w') as f:
237
+ json.dump(args.__dict__, f, indent=2)
238
+
239
+ def json_to_dict(path, args):
240
+ with open(path, 'r') as f:
241
+ args.__dict__ = json.load(f)
242
+ return args
243
+
244
+ def log_args(args, logger):
245
+ text = ""
246
+ for k, v in vars(args).items():
247
+ text += f'{k}={v}\n'
248
+ logger.info(f"##### ARGS #####\n{text}")
249
+
250
+ def out2img(samples):
251
+ return th.clamp(127.5 * samples + 128.0, 0, 255).to(
252
+ dtype=th.uint8
253
+ ) # keep on the caller's device; an unconditional .cuda() breaks CPU-only runs
254
+
255
+ def get_gpu_usage():
256
+ device = th.device('cuda:0')
257
+ free, total = th.cuda.mem_get_info(device)
258
+ mem_used_MB = (total - free) / 1024 ** 2
259
+ return mem_used_MB
260
+
261
+ def get_wavs(norm_spec, vocoder, audio_scale, device):
262
+ norm_spec = norm_spec.squeeze(1)
263
+ norm_spec = norm_spec / audio_scale
264
+ post_norm_spec = denormalize(norm_spec).to(device)
265
+ raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
266
+ wavs = vocoder.inference(raw_chunk_spec)
267
+ return wavs
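A minimal end-to-end sketch of how these utilities chain together for sampling, assuming a vae, model (FLAV), vocoder, and rectified_flow object have already been constructed and moved to the target device; the latent shapes, frame count, audio scale, and output paths are illustration values only:

    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Assumed latent layouts: video (B, T, C, H, W), audio (B, T, C, mel_bins, time_per_frame).
    video_latent_size = (1, 10, 4, 32, 32)
    audio_latent_size = (1, 10, 1, 256, 10)

    video, audio_spec = generate_sample(
        vae=vae,
        rectified_flow=rectified_flow,
        forward_fn=model.forward,
        video_length=10,
        video_latent_size=video_latent_size,
        audio_latent_size=audio_latent_size,
        y=None,
        cfg_scale=None,
        device=device,
    )

    # Spectrogram -> waveform via the vocoder, then mux audio and video to disk.
    wavs = get_wavs(audio_spec, vocoder, audio_scale=3.5, device=device)
    save_multimodal(video[0],               # (T, C, H, W) for a single sample
                    wavs[0].cpu().numpy(),  # assumed: vocoder returns a batch of waveform tensors
                    "samples", "demo_000",
                    video_fps=10, audio_fps=16000)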