diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4a29e9483bbcf217cc2d89f37c6e9d04a01ab80
--- /dev/null
+++ b/app.py
@@ -0,0 +1,77 @@
+import gradio as gr
+import os
+import json
+import torch
+import soundfile as sf
+import numpy as np
+from pathlib import Path
+from transformers import AutoModel
+#from utils.llm import get_time_info
+from utils.llm_xiapi import get_time_info
+
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load the model once at import time; `infer` below reuses this module-level instance.
+# NOTE(review): trust_remote_code=True runs Python code shipped with the
+# "rookie9/PicoAudio2" repository, so that repository must be trusted.
+model = AutoModel.from_pretrained("rookie9/PicoAudio2", trust_remote_code=True).to(device)
+# Startup marker printed after the model has been loaded and moved to `device`.
+print("ok")
+def is_tdc_format_valid(tdc_str):
+    """Check whether a TDC (onset) string can be parsed.
+
+    The expected layout is ``event1__start1-end1_start2-end2--event2__start1-end1``:
+    events are separated by ``--``, an event name is separated from its
+    intervals by ``__``, intervals are separated by ``_`` and every interval
+    is ``start-end`` with numeric bounds.
+
+    Args:
+        tdc_str: Candidate onset string.
+
+    Returns:
+        True if every event and interval follows the layout and all bounds
+        parse as numbers; False otherwise (including non-string input).
+    """
+    try:
+        for event_onset in tdc_str.split('--'):
+            # Exactly one "__" per event, otherwise the unpacking raises ValueError.
+            _event, instance = event_onset.split('__')
+            for start_end in instance.split('_'):
+                start, end = start_end.split('-')
+                # Onset decoding converts the bounds with float(); reject text
+                # such as "a-b" here instead of failing later inside the model.
+                float(start)
+                float(end)
+        return True
+    except (AttributeError, TypeError, ValueError):
+        # AttributeError/TypeError: not a str; ValueError: wrong structure or number.
+        return False
+
+def infer(input_text, input_onset, input_length, time_control):
+    """Generate audio for a caption and return the WAV path plus the TDC that was used.
+
+    Args:
+        input_text: Caption (TCC) describing the audio to generate.
+        input_onset: Optional TDC string; replaced by "random" when it does not
+            pass `is_tdc_format_valid`.
+        input_length: Optional clip length in seconds, as typed into the textbox.
+        time_control: When True and the onset or the length is missing, both
+            are requested from the LLM helper `get_time_info`. When False,
+            missing values default to "random" and "10.0".
+
+    Returns:
+        A tuple ``(wav_path, onset)``: the path of the written WAV file and the
+        onset value handed to the model, converted to a string.
+    """
+    import tempfile
+
+    if input_onset and not is_tdc_format_valid(input_onset):
+        input_onset = "random"
+    if time_control:
+        if not input_onset or not input_length:
+            # get_time_info is expected to return a JSON document with
+            # "onset" and "length" keys.
+            input_json = json.loads(get_time_info(input_text))
+            input_onset, input_length = input_json["onset"], input_json["length"]
+    else:
+        input_onset = input_onset or "random"
+        input_length = input_length or "10.0"
+
+    content = {
+        "caption": input_text,
+        "onset": input_onset,
+        "length": input_length,
+    }
+
+    with torch.no_grad():
+        waveform = model(content)
+
+    # The sample rate used to be read from `exp_config`, a name that is never
+    # defined in this file (NameError on every call). Prefer a `sample_rate`
+    # attribute on the model config when one exists and fall back to 24 kHz.
+    # NOTE(review): 24000 matches the rate documented for onset decoding in
+    # models/content_encoder/caption_encoder.py — confirm against the checkpoint.
+    sample_rate = getattr(getattr(model, "config", None), "sample_rate", 24000)
+
+    # One file per call: a fixed "output.wav" would be overwritten when two
+    # requests run at the same time. The file is kept (delete=False) so the
+    # UI can still read it after this function returns.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        output_wav = tmp.name
+    sf.write(
+        output_wav,
+        waveform[0, 0].cpu().numpy(),
+        samplerate=sample_rate,
+    )
+    return output_wav, str(input_onset)
+
+# Gradio UI: four inputs mapped positionally onto infer(input_text, input_onset,
+# input_length, time_control) and two outputs matching its (wav_path, onset) result.
+demo = gr.Interface(
+    fn=infer,
+    inputs=[
+        gr.Textbox(label="TCC (caption, required)", value="a dog barks"),
+        gr.Textbox(label="TDC (optional, see format)", value="random"),
+        gr.Textbox(label="Length (seconds, optional)", value="10.0"),
+        gr.Checkbox(label="Enable Time Control", value=False),
+    ],
+    outputs=[
+        gr.Audio(label="Generated Audio"),
+        gr.Textbox(label="Final TDC Used (input_onset)")
+    ],
+    title="PicoAudio2 Online Inference",
+    # Adjacent literals are concatenated, so each one ends with a space.
+    description=(
+        "TCC (caption) is required to generate audio. "
+        "If you need time control, please enter TDC and length (in seconds). "
+        "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. "
+        "TDC format: \"event1__start1-end1_start2-end2--event2__start1-end1\", for example: "
+        "\"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0\". "
+        "If the TDC format is wrong or no length is given, the model will generate audio without temporal control. Sorry!"
+    )
+)
+# Start the web server only when the file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()
\ No newline at end of file
diff --git a/models/__pycache__/common.cpython-310.pyc b/models/__pycache__/common.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2945374926290bb58fd9d6af7dd55961592e3bf
Binary files /dev/null and b/models/__pycache__/common.cpython-310.pyc differ
diff --git a/models/__pycache__/content_adapter.cpython-310.pyc b/models/__pycache__/content_adapter.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3de9f12f7f15396218a2177b2b598c30ab6cdc7
Binary files /dev/null and b/models/__pycache__/content_adapter.cpython-310.pyc differ
diff --git a/models/__pycache__/diffusion.cpython-310.pyc b/models/__pycache__/diffusion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea2335bb198a2d9eb6c9f3adf8792caf2fe68dca
Binary files /dev/null and b/models/__pycache__/diffusion.cpython-310.pyc differ
diff --git a/models/__pycache__/diffusion_cfg.cpython-310.pyc b/models/__pycache__/diffusion_cfg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cb083c6ebda48738fc5885dcc5b1d620a667632
Binary files /dev/null and b/models/__pycache__/diffusion_cfg.cpython-310.pyc differ
diff --git a/models/__pycache__/diffusion_cfg_new.cpython-310.pyc b/models/__pycache__/diffusion_cfg_new.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8946abbbfe664f7553d7545d28fcfa0812b32057
Binary files /dev/null and b/models/__pycache__/diffusion_cfg_new.cpython-310.pyc differ
diff --git a/models/__pycache__/diffusion_content_cfg.cpython-310.pyc b/models/__pycache__/diffusion_content_cfg.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb3b0cca20cd3c06339da4b0a57617a086f4eab6
Binary files /dev/null and b/models/__pycache__/diffusion_content_cfg.cpython-310.pyc differ
diff --git a/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc b/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4be9a8bcbcc162270f0cd469a80c82b148df1694
Binary files /dev/null and b/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc differ
diff --git a/models/autoencoder/autoencoder_base.py b/models/autoencoder/autoencoder_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..2852ad185b48e9595e116735baed689fa09cc0d3
--- /dev/null
+++ b/models/autoencoder/autoencoder_base.py
@@ -0,0 +1,22 @@
+from abc import abstractmethod, ABC
+from typing import Sequence
+import torch
+import torch.nn as nn
+
+
+class AutoEncoderBase(ABC):
+    """Abstract interface for autoencoders that map waveforms to latents.
+
+    The constructor stores the sampling metadata and derives the latent token
+    rate and the position of the time axis in a batched latent from it.
+    """
+
+    def __init__(
+        self,
+        downsampling_ratio: int,
+        sample_rate: int,
+        latent_shape: Sequence[int | None],
+    ):
+        """Store the codec metadata.
+
+        Args:
+            downsampling_ratio: Integer divisor applied to `sample_rate` to
+                obtain `latent_token_rate`.
+            sample_rate: Sample rate of the waveform.
+            latent_shape: Per-sample latent shape (without batch); the entry
+                that is ``None`` marks the time axis.
+
+        Raises:
+            ValueError: If `latent_shape` contains no ``None`` entry.
+        """
+        self.downsampling_ratio = downsampling_ratio
+        self.sample_rate = sample_rate
+        self.latent_token_rate = sample_rate // downsampling_ratio
+        self.latent_shape = latent_shape
+        # Shift by one because a batched latent has the batch axis in front.
+        self.time_dim = 1 + latent_shape.index(None)
+
+    @abstractmethod
+    def encode(
+        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Encode a waveform batch; subclasses return a pair of tensors."""
+        ...
diff --git a/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc b/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b167a38dcd8cbac4dd85da41050a0ec3cbea454
Binary files /dev/null and b/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc differ
diff --git a/models/autoencoder/waveform/stable_vae.py b/models/autoencoder/waveform/stable_vae.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7373ec13aca61a57dd5849735a745cf88b39e68
--- /dev/null
+++ b/models/autoencoder/waveform/stable_vae.py
@@ -0,0 +1,537 @@
+from typing import Any, Literal, Callable
+import math
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+from torch.nn.utils.parametrizations import weight_norm
+import torchaudio
+from alias_free_torch import Activation1d
+
+from models.common import LoadPretrainedBase
+from models.autoencoder.autoencoder_base import AutoEncoderBase
+from utils.torch_utilities import remove_key_prefix_factory, create_mask_from_length
+
+
+# Compiled with TorchScript: reported to be about 1.4x faster and to save GPU memory.
+@torch.jit.script
+def snake_beta(x, alpha, beta):
+    """Compute ``x + sin(alpha * x)^2 / beta`` element-wise.
+
+    Args:
+        x: Input tensor.
+        alpha: Frequency term, broadcast against `x`.
+        beta: Magnitude term, broadcast against `x`.
+
+    Returns:
+        Tensor with the broadcast shape of the inputs.
+    """
+    # The small constant keeps the reciprocal finite when beta is zero.
+    inv_beta = 1.0 / (beta + 0.000000001)
+    periodic = pow(torch.sin(x * alpha), 2)
+    return x + inv_beta * periodic
+
+
+class SnakeBeta(nn.Module):
+    """Snake activation with one learnable ``alpha`` and ``beta`` per channel."""
+
+    def __init__(
+        self,
+        in_features,
+        alpha=1.0,
+        alpha_trainable=True,
+        alpha_logscale=True
+    ):
+        """Create the per-channel parameters.
+
+        Args:
+            in_features: Number of channels, i.e. the length of each parameter.
+            alpha: Factor multiplied onto the initial parameter values.
+            alpha_trainable: Value assigned to ``requires_grad`` of both parameters.
+            alpha_logscale: If True the parameters are stored in log space and
+                exponentiated in `forward`.
+        """
+        super().__init__()
+        self.in_features = in_features
+        self.alpha_logscale = alpha_logscale
+
+        # Log-scale parameters start from zeros, linear-scale ones from ones;
+        # both are then multiplied by `alpha`.
+        make_init = torch.zeros if self.alpha_logscale else torch.ones
+        self.alpha = nn.Parameter(make_init(in_features) * alpha)
+        self.beta = nn.Parameter(make_init(in_features) * alpha)
+
+        for param in (self.alpha, self.beta):
+            param.requires_grad = alpha_trainable
+
+    def forward(self, x):
+        """Apply the activation to `x`, laid out as [B, C, T]."""
+        # (C,) -> (1, C, 1) so the parameters broadcast over batch and time.
+        alpha = self.alpha[None, :, None]
+        beta = self.beta[None, :, None]
+        if self.alpha_logscale:
+            alpha, beta = torch.exp(alpha), torch.exp(beta)
+        return snake_beta(x, alpha, beta)
+
+
+def WNConv1d(*args, **kwargs):
+    """Build an ``nn.Conv1d`` from the given arguments and apply weight normalization."""
+    conv = nn.Conv1d(*args, **kwargs)
+    return weight_norm(conv)
+
+
+def WNConvTranspose1d(*args, **kwargs):
+    """Build an ``nn.ConvTranspose1d`` from the given arguments and apply weight normalization."""
+    deconv = nn.ConvTranspose1d(*args, **kwargs)
+    return weight_norm(deconv)
+
+
+def get_activation(
+    activation: Literal["elu", "snake", "none"],
+    antialias=False,
+    channels=None
+) -> nn.Module:
+    """Create an activation module by name.
+
+    Args:
+        activation: "elu", "snake" or "none".
+        antialias: If True, wrap the activation in `Activation1d`.
+        channels: Channel count passed to `SnakeBeta`; only used for "snake".
+
+    Returns:
+        The activation module, optionally wrapped.
+
+    Raises:
+        ValueError: If `activation` is not one of the supported names.
+    """
+    match activation:
+        case "elu":
+            act = nn.ELU()
+        case "snake":
+            act = SnakeBeta(channels)
+        case "none":
+            act = nn.Identity()
+        case _:
+            raise ValueError(f"Unknown activation {activation}")
+
+    return Activation1d(act) if antialias else act
+
+
+class ResidualUnit(nn.Module):
+    """Residual block: activation, dilated 7-tap conv, activation, 1x1 conv, plus skip."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        dilation,
+        use_snake=False,
+        antialias_activation=False
+    ):
+        """Build the four-layer stack.
+
+        Args:
+            in_channels: Input channels of the dilated convolution.
+            out_channels: Output channels of both convolutions; also the
+                channel count given to both activations.
+            dilation: Dilation of the 7-tap convolution.
+            use_snake: Use the "snake" activation instead of "elu".
+            antialias_activation: Forwarded to `get_activation` as `antialias`.
+        """
+        super().__init__()
+
+        self.dilation = dilation
+
+        kernel_size = 7
+        # Symmetric padding that offsets the span of the dilated kernel.
+        padding = (dilation * (kernel_size - 1)) // 2
+        act_name = "snake" if use_snake else "elu"
+
+        # NOTE(review): the first activation sees the block input but is sized
+        # with out_channels; every caller in this file passes equal channel
+        # counts, which the residual sum in forward presumably relies on.
+        pre_act = get_activation(
+            act_name, antialias=antialias_activation, channels=out_channels
+        )
+        dilated_conv = WNConv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            dilation=dilation,
+            padding=padding
+        )
+        post_act = get_activation(
+            act_name, antialias=antialias_activation, channels=out_channels
+        )
+        pointwise_conv = WNConv1d(
+            in_channels=out_channels, out_channels=out_channels, kernel_size=1
+        )
+
+        self.layers = nn.Sequential(
+            pre_act, dilated_conv, post_act, pointwise_conv
+        )
+
+    def forward(self, x):
+        """Return the layer stack applied to `x`, added to `x`."""
+        return self.layers(x) + x
+
+
+class EncoderBlock(nn.Module):
+    """Three residual units (dilations 1, 3, 9), an activation and a strided convolution."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride,
+        use_snake=False,
+        antialias_activation=False
+    ):
+        """Build the block.
+
+        Args:
+            in_channels: Channels of the input and of the residual units.
+            out_channels: Channels produced by the final strided convolution.
+            stride: Stride of the final convolution; its kernel size is
+                ``2 * stride``.
+            use_snake: Use the "snake" activation instead of "elu".
+            antialias_activation: Applied to the activation in front of the
+                strided convolution only; the residual units are built
+                without it.
+        """
+        super().__init__()
+
+        residual_stack = [
+            ResidualUnit(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                dilation=dilation,
+                use_snake=use_snake
+            ) for dilation in (1, 3, 9)
+        ]
+        pre_conv_act = get_activation(
+            "snake" if use_snake else "elu",
+            antialias=antialias_activation,
+            channels=in_channels
+        )
+        strided_conv = WNConv1d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=2 * stride,
+            stride=stride,
+            padding=math.ceil(stride / 2)
+        )
+
+        self.layers = nn.Sequential(
+            *residual_stack, pre_conv_act, strided_conv
+        )
+
+    def forward(self, x):
+        """Run `x` through the block."""
+        return self.layers(x)
+
+
+class DecoderBlock(nn.Module):
+    """An activation, an upsampling layer and three residual units (dilations 1, 3, 9)."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride,
+        use_snake=False,
+        antialias_activation=False,
+        use_nearest_upsample=False
+    ):
+        """Build the block.
+
+        Args:
+            in_channels: Channels of the block input.
+            out_channels: Channels after upsampling and in the residual units.
+            stride: Upsampling factor; the kernel size is ``2 * stride``.
+            use_snake: Use the "snake" activation instead of "elu".
+            antialias_activation: Applied to the leading activation only; the
+                residual units are built without it.
+            use_nearest_upsample: If True, upsample with nearest-neighbour
+                interpolation followed by a stride-1 convolution; otherwise
+                use a transposed convolution.
+        """
+        super().__init__()
+
+        kernel_size = 2 * stride
+        if not use_nearest_upsample:
+            upsampler = WNConvTranspose1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=math.ceil(stride / 2)
+            )
+        else:
+            upsampler = nn.Sequential(
+                nn.Upsample(scale_factor=stride, mode="nearest"),
+                WNConv1d(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=kernel_size,
+                    stride=1,
+                    bias=False,
+                    padding='same'
+                )
+            )
+
+        leading_act = get_activation(
+            "snake" if use_snake else "elu",
+            antialias=antialias_activation,
+            channels=in_channels
+        )
+        residual_stack = [
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=dilation,
+                use_snake=use_snake
+            ) for dilation in (1, 3, 9)
+        ]
+
+        self.layers = nn.Sequential(leading_act, upsampler, *residual_stack)
+
+    def forward(self, x):
+        """Run `x` through the block."""
+        return self.layers(x)
+
+
+class OobleckEncoder(nn.Module):
+    """Convolutional encoder: input conv, a chain of `EncoderBlock`s, output conv."""
+
+    def __init__(
+        self,
+        in_channels=2,
+        channels=128,
+        latent_dim=32,
+        c_mults=(1, 2, 4, 8),
+        strides=(2, 4, 8, 8),
+        use_snake=False,
+        antialias_activation=False
+    ):
+        """Build the encoder.
+
+        Args:
+            in_channels: Channels of the input signal.
+            channels: Base width; stage widths are ``multiplier * channels``.
+            latent_dim: Channels produced by the final convolution.
+            c_mults: Width multiplier per stage (any sequence); a leading 1 is
+                prepended for the input convolution.
+            strides: Stride of each `EncoderBlock`, one per entry of `c_mults`.
+            use_snake: Use the "snake" activation instead of "elu".
+            antialias_activation: Applied to the final activation only; the
+                encoder blocks are built without it.
+        """
+        super().__init__()
+
+        # Build a fresh list: accepts lists, tuples and other sequences and
+        # never touches the caller's object (the defaults are immutable tuples).
+        c_mults = [1, *c_mults]
+
+        self.depth = len(c_mults)
+
+        widths = [mult * channels for mult in c_mults]
+
+        layers = [
+            WNConv1d(
+                in_channels=in_channels,
+                out_channels=widths[0],
+                kernel_size=7,
+                padding=3
+            )
+        ]
+
+        # Stage i maps widths[i] -> widths[i + 1] using strides[i].
+        layers += [
+            EncoderBlock(
+                in_channels=widths[i],
+                out_channels=widths[i + 1],
+                stride=strides[i],
+                use_snake=use_snake
+            ) for i in range(self.depth - 1)
+        ]
+
+        layers += [
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=widths[-1]
+            ),
+            WNConv1d(
+                in_channels=widths[-1],
+                out_channels=latent_dim,
+                kernel_size=3,
+                padding=1
+            )
+        ]
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Run `x` through the encoder stack."""
+        return self.layers(x)
+
+
+class OobleckDecoder(nn.Module):
+    """Convolutional decoder: input conv, a chain of `DecoderBlock`s, output conv."""
+
+    def __init__(
+        self,
+        out_channels=2,
+        channels=128,
+        latent_dim=32,
+        c_mults=(1, 2, 4, 8),
+        strides=(2, 4, 8, 8),
+        use_snake=False,
+        antialias_activation=False,
+        use_nearest_upsample=False,
+        final_tanh=True
+    ):
+        """Build the decoder.
+
+        Args:
+            out_channels: Channels of the output signal.
+            channels: Base width; stage widths are ``multiplier * channels``.
+            latent_dim: Channels of the latent input.
+            c_mults: Width multiplier per stage (any sequence); a leading 1 is
+                prepended, and the stages are traversed from last to first.
+            strides: Stride of each `DecoderBlock`, one per entry of `c_mults`.
+            use_snake: Use the "snake" activation instead of "elu".
+            antialias_activation: Forwarded to the decoder blocks and applied
+                to the final activation.
+            use_nearest_upsample: Forwarded to the decoder blocks.
+            final_tanh: Append ``nn.Tanh`` when True, ``nn.Identity`` otherwise.
+        """
+        super().__init__()
+
+        # Build a fresh list: accepts lists, tuples and other sequences and
+        # never touches the caller's object (the defaults are immutable tuples).
+        c_mults = [1, *c_mults]
+
+        self.depth = len(c_mults)
+
+        widths = [mult * channels for mult in c_mults]
+
+        layers = [
+            WNConv1d(
+                in_channels=latent_dim,
+                out_channels=widths[-1],
+                kernel_size=7,
+                padding=3
+            ),
+        ]
+
+        # Walk the stages backwards: stage i maps widths[i] -> widths[i - 1]
+        # using strides[i - 1].
+        layers += [
+            DecoderBlock(
+                in_channels=widths[i],
+                out_channels=widths[i - 1],
+                stride=strides[i - 1],
+                use_snake=use_snake,
+                antialias_activation=antialias_activation,
+                use_nearest_upsample=use_nearest_upsample
+            ) for i in range(self.depth - 1, 0, -1)
+        ]
+
+        layers += [
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=widths[0]
+            ),
+            WNConv1d(
+                in_channels=widths[0],
+                out_channels=out_channels,
+                kernel_size=7,
+                padding=3,
+                bias=False
+            ),
+            nn.Tanh() if final_tanh else nn.Identity()
+        ]
+
+        self.layers = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Run `x` through the decoder stack."""
+        return self.layers(x)
+
+
+class Bottleneck(nn.Module):
+    """Base class for latent bottlenecks; subclasses provide `encode` and `decode`."""
+
+    def __init__(self, is_discrete: bool = False):
+        """Record whether the bottleneck is discrete."""
+        super().__init__()
+        self.is_discrete = is_discrete
+
+    def encode(self, x, return_info=False, **kwargs):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+    def decode(self, x):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+
+@torch.jit.script
+def vae_sample(mean, scale) -> dict[str, torch.Tensor]:
+    """Draw a latent sample and compute the KL term.
+
+    The standard deviation is ``softplus(scale) + 1e-4``.
+
+    Args:
+        mean: Mean of the distribution.
+        scale: Unconstrained scale tensor with the same shape as `mean`.
+
+    Returns:
+        Dict with "latents" (``mean + noise * stdev``) and "kl" (a scalar:
+        the KL expression summed over dim 1, then averaged).
+    """
+    std = nn.functional.softplus(scale) + 1e-4
+    variance = std * std
+    noise = torch.randn_like(mean)
+    latents = noise * std + mean
+
+    kl = (mean * mean + variance - torch.log(variance) - 1).sum(1).mean()
+    return {"latents": latents, "kl": kl}
+
+
+class VAEBottleneck(Bottleneck):
+    """Continuous bottleneck that samples latents with `vae_sample`."""
+
+    def __init__(self):
+        super().__init__(is_discrete=False)
+
+    def encode(
+        self,
+        x,
+        return_info=False,
+        **kwargs
+    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
+        """Split `x` into mean and scale along dim 1 and sample latents.
+
+        Args:
+            x: Tensor that is chunked into two parts along dim 1: the first
+                is used as the mean, the second as the scale.
+            return_info: If True, also return the KL term.
+            **kwargs: Accepted for interface compatibility and ignored.
+
+        Returns:
+            The sampled latents, or ``(latents, {"kl": kl})`` when
+            `return_info` is True.
+        """
+        mean, scale = x.chunk(2, dim=1)
+        sampled = vae_sample(mean, scale)
+
+        if return_info:
+            return sampled["latents"], {"kl": sampled["kl"]}
+        return sampled["latents"]
+
+    def decode(self, x):
+        """Return `x` unchanged."""
+        return x
+
+
+def compute_mean_kernel(x, y):
+    """Return the mean of ``exp(-d)`` over all pairs taken from `x` and `y`.
+
+    ``d`` is the squared difference of a pair, averaged over dim 2 and then
+    divided by ``x.shape[-1]``.
+    NOTE(review): assumes 2-D inputs shaped (n, d) and (m, d) — confirm with callers.
+    """
+    pairwise_diff = x[:, None] - y[None]
+    scaled_sq_dist = pairwise_diff.pow(2).mean(2) / x.shape[-1]
+    return torch.exp(-scaled_sq_dist).mean()
+
+
+class Pretransform(nn.Module):
+    """Base class for pretransforms; all four operations are left to subclasses."""
+
+    def __init__(self, enable_grad, io_channels, is_discrete):
+        """Store the configuration flags.
+
+        `encoded_channels` and `downsampling_ratio` start as None.
+        """
+        super().__init__()
+
+        self.is_discrete = is_discrete
+        self.io_channels = io_channels
+        # Left unset in the base class.
+        self.encoded_channels = None
+        self.downsampling_ratio = None
+
+        self.enable_grad = enable_grad
+
+    def encode(self, x):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+    def decode(self, z):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+    def tokenize(self, x):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+    def decode_tokens(self, tokens):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+
+class StableVAE(LoadPretrainedBase, AutoEncoderBase):
+    """Waveform autoencoder built from an encoder, an optional bottleneck and a decoder."""
+
+    def __init__(
+        self,
+        encoder,
+        decoder,
+        latent_dim,
+        downsampling_ratio,
+        sample_rate,
+        io_channels=2,
+        bottleneck: Bottleneck | None = None,
+        pretransform: Pretransform | None = None,
+        in_channels=None,
+        out_channels=None,
+        soft_clip=False,
+        pretrained_ckpt: str | Path | None = None
+    ):
+        """Assemble the autoencoder and optionally load pretrained weights.
+
+        Args:
+            encoder: Module mapping a waveform to the pre-bottleneck tensor.
+            decoder: Module mapping latents back to a waveform.
+            latent_dim: Latent channel count; the latent shape is
+                ``(latent_dim, None)``.
+            downsampling_ratio: Divisor used to derive latent lengths from
+                waveform lengths.
+            sample_rate: Sample rate of the waveform.
+            io_channels: Default for both `in_channels` and `out_channels`.
+            bottleneck: Optional bottleneck applied after the encoder.
+            pretransform: Stored on the instance; not used in this class.
+            in_channels: Overrides `io_channels` for the input when given.
+            out_channels: Overrides `io_channels` for the output when given.
+            soft_clip: Stored on the instance; not used in this class.
+            pretrained_ckpt: Checkpoint path passed to `load_pretrained`.
+        """
+        LoadPretrainedBase.__init__(self)
+        AutoEncoderBase.__init__(
+            self,
+            downsampling_ratio=downsampling_ratio,
+            sample_rate=sample_rate,
+            latent_shape=(latent_dim, None)
+        )
+
+        self.latent_dim = latent_dim
+        self.io_channels = io_channels
+        self.in_channels = io_channels if in_channels is None else in_channels
+        self.out_channels = io_channels if out_channels is None else out_channels
+        self.min_length = self.downsampling_ratio
+
+        # Registration order of the sub-modules is kept as before so that the
+        # key order of the state dict does not change.
+        self.bottleneck = bottleneck
+        self.encoder = encoder
+        self.decoder = decoder
+        self.pretransform = pretransform
+        self.soft_clip = soft_clip
+        self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete
+
+        # NOTE(review): presumably strips the "autoencoder." prefix from
+        # checkpoint keys — confirm in utils.torch_utilities.
+        self.remove_autoencoder_prefix_fn: Callable = remove_key_prefix_factory(
+            "autoencoder."
+        )
+        if pretrained_ckpt is not None:
+            self.load_pretrained(pretrained_ckpt)
+
+    def process_state_dict(self, model_dict, state_dict):
+        """Unwrap the weights stored under "state_dict" and apply the prefix function."""
+        state_dict = state_dict["state_dict"]
+        return self.remove_autoencoder_prefix_fn(model_dict, state_dict)
+
+    def encode(
+        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Encode a waveform batch into latents.
+
+        Args:
+            waveform: Input passed to the encoder.
+            waveform_lengths: Length of each waveform in samples.
+
+        Returns:
+            ``(z, z_mask)``: the latents and the mask that
+            `create_mask_from_length` builds from the latent lengths
+            ``waveform_lengths // downsampling_ratio``.
+        """
+        z = self.encoder(waveform)
+        # `bottleneck` defaults to None; without this guard the default
+        # configuration failed with AttributeError.
+        if self.bottleneck is not None:
+            z = self.bottleneck.encode(z)
+        z_length = waveform_lengths // self.downsampling_ratio
+        z_mask = create_mask_from_length(z_length)
+        return z, z_mask
+
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        """Decode latents back into a waveform with the decoder."""
+        return self.decoder(latents)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: instantiate the autoencoder from its Hydra config, run
+    # one encode/decode round trip on a WAV file and write the result to disk.
+    import hydra
+    from utils.config import generate_config_from_command_line_overrides
+    model_config = generate_config_from_command_line_overrides(
+        "configs/model/autoencoder/stable_vae.yaml"
+    )
+    autoencoder: StableVAE = hydra.utils.instantiate(model_config)
+    autoencoder.eval()
+
+    # NOTE(review): hard-coded absolute path from the author's environment;
+    # replace it with a local file before running.
+    waveform, sr = torchaudio.load(
+        "/hpc_stor03/sjtu_home/xuenan.xu/workspace/singing_voice_synthesis/diffsinger/data/raw/opencpop/segments/wavs/2007000230.wav"
+    )
+    # Resample to the rate configured for the model.
+    waveform = torchaudio.functional.resample(
+        waveform, sr, model_config["sample_rate"]
+    )
+    print("waveform: ", waveform.shape)
+    with torch.no_grad():
+        # encode returns (latents, mask); the second value is the mask even
+        # though it is bound to the name `latent_length` here.
+        latent, latent_length = autoencoder.encode(
+            waveform, torch.as_tensor([waveform.shape[-1]])
+        )
+        print("latent: ", latent.shape)
+        reconstructed = autoencoder.decode(latent)
+        print("reconstructed: ", reconstructed.shape)
+    import soundfile as sf
+    # Write only the element at index [0, 0] of the reconstruction.
+    sf.write(
+        "./reconstructed.wav",
+        reconstructed[0, 0].numpy(),
+        samplerate=model_config["sample_rate"]
+    )
diff --git a/models/common.py b/models/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4832f8fb67dda9cb608f6874b2b4f17ac20442a
--- /dev/null
+++ b/models/common.py
@@ -0,0 +1,69 @@
+from pathlib import Path
+import torch
+import torch.nn as nn
+from utils.torch_utilities import load_pretrained_model, merge_matched_keys
+import warnings
+
+class LoadPretrainedBase(nn.Module):
+    """Mixin that loads pretrained weights through an overridable state-dict hook."""
+
+    def process_state_dict(
+        self,
+        model_dict: dict[str, torch.Tensor],
+        state_dict: dict[str, torch.Tensor],
+    ):
+        """Turn a checkpoint `state_dict` into one usable by `load_state_dict`.
+
+        Subclasses override this for model-specific processing. The default
+        uses `merge_matched_keys` to update parameters whose names and shapes
+        match.
+
+        Args:
+            model_dict: State dict of the current model, which is about to
+                receive the pretrained parameters.
+            state_dict: Parameters from a pre-trained model.
+
+        Returns:
+            dict[str, torch.Tensor]: The updated state dict, where parameters
+            with matched keys and shapes carry the values from `state_dict`.
+        """
+        return merge_matched_keys(model_dict, state_dict)
+
+    def load_pretrained(self, ckpt_path: str | Path):
+        """Load `ckpt_path` into this module, using `process_state_dict` as the hook."""
+        load_pretrained_model(
+            self,
+            ckpt_path,
+            state_dict_process_fn=self.process_state_dict,
+        )
+
+
+class CountParamsBase(nn.Module):
+    """Mixin that reports parameter counts."""
+
+    def count_params(self):
+        """Return ``(total, trainable)`` element counts over `self.parameters()`."""
+        sizes = [(p.numel(), p.requires_grad) for p in self.parameters()]
+        num_params = sum(size for size, _ in sizes)
+        trainable_params = sum(size for size, trainable in sizes if trainable)
+        return num_params, trainable_params
+
+
+class SaveTrainableParamsBase(nn.Module):
+    """Mixin that lists the state-dict entries worth saving and loads state dicts."""
+
+    @property
+    def param_names_to_save(self):
+        """Names of all trainable parameters followed by the names of all buffers."""
+        names = [
+            name for name, param in self.named_parameters()
+            if param.requires_grad
+        ]
+        names.extend(name for name, _ in self.named_buffers())
+        return names
+
+    def load_state_dict(self, state_dict, strict=True, assign=True):
+        """Load `state_dict` via ``nn.Module.load_state_dict``.
+
+        Args:
+            state_dict: Mapping from parameter/buffer names to tensors.
+            strict: Forwarded unchanged.
+            assign: Forwarded unchanged; note that the default here is True.
+
+        Returns:
+            Whatever ``nn.Module.load_state_dict`` returns.
+        """
+        import logging
+
+        # The key listing is debugging output; it goes to the logger at DEBUG
+        # level instead of being printed to stdout on every load.
+        logging.getLogger(__name__).debug(
+            "State dict keys: %s", list(state_dict.keys())
+        )
+        # NOTE: a former check that every name in `param_names_to_save` is
+        # present in `state_dict` is disabled.
+        # `assign` is accepted for compatibility with PyTorch/transformers.
+        return super().load_state_dict(state_dict, strict=strict, assign=assign)
\ No newline at end of file
diff --git a/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e0552905613763ed5a88794610d9dbe05a10131
Binary files /dev/null and b/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33efd478b08e3f719abf053f4f520eb7302f5371
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d40ca4898f7470a736c2a171a1e38a7d2ab9d9ca
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79adb09e0091180f1bc35847071909812d972d1d
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e012f9e93ee6c113928c4dc360d46da1972f18fa
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf6a282b5699a8cd3326d6fdc3b92909399af824
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2eb75653d1b41deef2d121517f4ef456f9c0c1ac
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d333ccde4745c521f0e667eb16c19dc084e9f03
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7b41dfaf5f32cd73c6cb0b99541b9f1ddc042a
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19b96e898a962712ea8ae84b21ce7a24613d1d4c
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..179970ea364694e59ca267f3858b339c3720c21d
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f97e423fc2c523ff9fc6272dc96757eee3dd34dd
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37952ea021a63ed12e1ed07d1073cb2f0abdee00
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ab920c0c1e5ced61a6b1856cf5305c7a2b2746e
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02b660998b3b2bc248603d52ffbdf8ba1b0a83ad
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3d2b38176b73a4dc3683a7b9b990fb05b44157
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd5b37a17053da1f29a3c9d93352ba3f76dcff51
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..731dd53850c2d82afc8aa66a3d0e2e6711dd41c5
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cfd15e674bfb421c1f7516ff6e9b594ccfc11ce
Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/content_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_test.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05a6d2efc1a9bb64543023ba3f08c3e389bc6e6b
Binary files /dev/null and b/models/content_encoder/__pycache__/content_test.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95b289a672b9819bccf7cb334e3f93ffe02b53b5
Binary files /dev/null and b/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc differ
diff --git a/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..08cfbf6e6a50215a83462c23f932331b30eee53a
Binary files /dev/null and b/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc differ
diff --git a/models/content_encoder/caption_encoder.py b/models/content_encoder/caption_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..967d59bd4fe1453959d7d2493718f19eb888b18d
--- /dev/null
+++ b/models/content_encoder/caption_encoder.py
@@ -0,0 +1,116 @@
+from typing import Any
+import torch
+import torch.nn as nn
+import random
+from utils.audiotime_event_merge import replace_event_synonyms
+
+def decode_data(line_onset_str, latent_length):
+    """
+    Extracts a timestamp matrix (event onset indices) from a formatted onset string.
+
+    Args:
+        line_onset_str (str): String containing event names and onset intervals,
+            formatted like "event1__start1-end1_start2-end2--event2__start1-end1".
+        latent_length (int): Length of the output matrix.
+
+    Returns:
+        line_onset_index (torch.Tensor): Matrix of shape [4, latent_length], 
+        line_event (list): List of event names extracted from the onset string.
+
+    Notes:
+        - 24000 is the audio sample rate.
+        - 480 is the downsample ratio to align with VAE.
+        - Each onset interval "start-end" (in seconds) is converted to embedding indices via (time * 24000 / 480).
+    """
+    line_onset_index = torch.zeros((4, latent_length)) # max for 4 events
+    line_event = []
+    event_idx = 0
+    for event_onset in line_onset_str.split('--'):
+        #print(event_onset)
+        (event, instance) = event_onset.split('__')
+        #print(instance)
+        line_event.append(event)
+        for start_end in instance.split('_'):
+            (start, end) = start_end.split('-')         
+            start, end = int(float(start)*24000/480), int(float(end)*24000/480)
+            if end > (latent_length - 1): break
+            line_onset_index[event_idx, start: end] = 1
+        event_idx = event_idx + 1
+    return line_onset_index, line_event
+    
+
+class ContentEncoder(nn.Module):
+    """
+    ContentEncoder encodes TCC and TDC information.
+    """
+    def __init__(
+        self,
+        text_encoder: nn.Module= None,
+    ):
+        super().__init__()
+        self.text_encoder = text_encoder
+        self.pool = nn.AdaptiveAvgPool1d(1)
+
+    def encode_content(
+        self, batch_content: list[Any], device: str | torch.device
+    ):
+        batch_output = []
+        batch_mask = []
+        batch_onset = []
+        length_list = []
+        print(batch_content)
+        for content in batch_content:
+
+            caption = content["caption"]
+            onset = content["onset"]
+            length = int(float(content["length"]) *24000/480)
+                # Replacement for AudioTime
+            print(onset)
+            replace_label = content.get("replace_label", "False")
+            if replace_label == "True":
+                caption, onset = replace_event_synonyms(caption, onset)
+                
+            # Handle random onset case for read data without timestamp
+            if content["onset"] == "random":
+                length_list.append(length)
+                """
+                fixed embedding. Actually it's a sick sentence, a error during training, kept to match the checkpoint.
+                You can change it to sentence that difference to captions in datasets. 
+                The use of fixed text to obtain encoding is for numerical stability. 
+                We attempted to use learnable unified encoding during training, but the results were not satisfactory.
+                """
+                event = "There is no event here" 
+                event_embed = self.text_encoder([event.replace("_", " ")])["output"]
+                event_embed = self.pool(event_embed.permute(0, 2, 1))  # (B, 1024, 1)
+                event_embed = event_embed.flatten().unsqueeze(0)
+                new_onset = event_embed.repeat(length, 1).T
+            else:
+                onset_matrix, events = decode_data(onset, length)
+                length_list.append(length)
+                new_onset = torch.zeros((1024, length), device=device) # 1024 for T5
+                # TDC
+                for (idx, event) in enumerate(events):
+                    with torch.no_grad():
+                        event_embed = self.text_encoder([event.replace("_", " ")])["output"]
+                    event_embed = self.pool(event_embed.permute(0, 2, 1))  # (B, 1024, 1)
+                    event_embed = event_embed.flatten().unsqueeze(0)
+                    mask = (onset_matrix[idx, :] == 0)
+                    cols = mask.nonzero(as_tuple=True)[0]
+                    new_onset[:, cols] += event_embed.T.float()
+            # TCC
+            output_dict = self.text_encoder([caption])
+            batch_output.append(output_dict["output"][0])
+            batch_mask.append(output_dict["mask"][0])
+            batch_onset.append(new_onset)
+            
+        # Pad all sequences in the batch to the same length for batching
+        batch_output = nn.utils.rnn.pad_sequence(
+            batch_output, batch_first=True, padding_value=0
+        )
+        batch_mask = nn.utils.rnn.pad_sequence(
+            batch_mask, batch_first=True, padding_value=False
+        )
+        batch_onset = nn.utils.rnn.pad_sequence(
+            batch_onset, batch_first=True, padding_value=0
+        )
+        return batch_output, batch_mask, batch_onset, length_list
diff --git a/models/content_encoder/text_encoder.py b/models/content_encoder/text_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c830be27b0aad930945d8578b047bcf6618d07ff
--- /dev/null
+++ b/models/content_encoder/text_encoder.py
@@ -0,0 +1,76 @@
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel
+from transformers.modeling_outputs import BaseModelOutput
+
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+    DEVICE_TYPE = "npu"
+except ModuleNotFoundError:
+    DEVICE_TYPE = "cuda"
+
+
+class TransformersTextEncoderBase(nn.Module):
+    """
+    Base class for text encoding using HuggingFace Transformers models.
+
+    """
+    def __init__(self, model_name: str):
+        super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+
+    def forward(
+        self,
+        text: list[str],
+    ):
+        device = self.model.device
+        batch = self.tokenizer(
+            text,
+            max_length=self.tokenizer.model_max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        input_ids = batch.input_ids.to(device)
+        attention_mask = batch.attention_mask.to(device)
+        output: BaseModelOutput = self.model(
+            input_ids=input_ids, attention_mask=attention_mask
+        )
+        output = output.last_hidden_state
+        mask = (attention_mask == 1).to(device)
+
+        return {"output": output, "mask": mask}
+
+
+class T5TextEncoder(TransformersTextEncoderBase):
+    """
+    Text encoder using T5 encoder model.
+    """
+    def __init__(self, model_name: str = "/mnt/petrelfs/zhengzihao/cache/google-flan-t5-large"):
+        nn.Module.__init__(self)
+        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
+        self.model = T5EncoderModel.from_pretrained(model_name)
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.eval()
+
+    def forward(
+        self,
+        text: list[str],
+    ):
+        with torch.no_grad(), torch.amp.autocast(
+            device_type=DEVICE_TYPE, enabled=False
+        ):
+            return super().forward(text)
+
+
+if __name__ == '__main__':
+    text_encoder = T5TextEncoder()
+    text = ["dog barking and cat moving"]
+    text_encoder.eval()
+    with torch.no_grad():
+        output = text_encoder(text)
+    print(output["output"].shape)
+    #print(output)
\ No newline at end of file
diff --git a/models/diffusion.py b/models/diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4363261f44eea0807991c494a6e1af161a8b380
--- /dev/null
+++ b/models/diffusion.py
@@ -0,0 +1,398 @@
+from typing import Sequence
+import random
+from typing import Any
+
+from tqdm import tqdm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import diffusers.schedulers as noise_schedulers
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils.torch_utils import randn_tensor
+
+import numpy as np
+from models.autoencoder.autoencoder_base import AutoEncoderBase
+from models.content_encoder.caption_encoder import ContentEncoder
+from models.common import LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase
+from utils.torch_utilities import (
+    create_alignment_path, create_mask_from_length, loss_with_mask,
+    trim_or_pad_length
+)
+
+
+class DiffusionMixin:
+    def __init__(
+        self,
+        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
+        snr_gamma: float = None,
+        classifier_free_guidance: bool = True,
+        cfg_drop_ratio: float = 0.2,
+
+    ) -> None:
+        self.noise_scheduler_name = noise_scheduler_name
+        self.snr_gamma = snr_gamma
+        self.classifier_free_guidance = classifier_free_guidance
+        self.cfg_drop_ratio = cfg_drop_ratio
+        self.noise_scheduler = noise_schedulers.DDIMScheduler.from_pretrained(
+            self.noise_scheduler_name, subfolder="scheduler"
+        )
+
+    def compute_snr(self, timesteps) -> torch.Tensor:
+        """
+        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+        """
+        alphas_cumprod = self.noise_scheduler.alphas_cumprod
+        sqrt_alphas_cumprod = alphas_cumprod**0.5
+        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5
+
+        # Expand the tensors.
+        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device
+                                                    )[timesteps].float()
+        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+
+        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
+            device=timesteps.device
+        )[timesteps].float()
+        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
+                                                                          None]
+        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+
+        # Compute SNR.
+        snr = (alpha / sigma)**2
+        return snr
+
+    def get_timesteps(
+        self,
+        batch_size: int,
+        device: torch.device,
+        training: bool = True
+    ) -> torch.Tensor:
+        if training:
+            timesteps = torch.randint(
+                0,
+                self.noise_scheduler.config.num_train_timesteps,
+                (batch_size, ),
+                device=device
+            )
+        else:
+            # validation on half of the total timesteps
+            timesteps = (self.noise_scheduler.config.num_train_timesteps //
+                         2) * torch.ones((batch_size, ),
+                                         dtype=torch.int64,
+                                         device=device)
+
+        timesteps = timesteps.long()
+        return timesteps
+
+    def get_target(
+        self, latent: torch.Tensor, noise: torch.Tensor,
+        timesteps: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Get the target for loss depending on the prediction type
+        """
+        if self.noise_scheduler.config.prediction_type == "epsilon":
+            target = noise
+        elif self.noise_scheduler.config.prediction_type == "v_prediction":
+            target = self.noise_scheduler.get_velocity(
+                latent, noise, timesteps
+            )
+        else:
+            raise ValueError(
+                f"Unknown prediction type {self.noise_scheduler.config.prediction_type}"
+            )
+        return target
+
+    def loss_with_snr(
+        self, pred: torch.Tensor, target: torch.Tensor,
+        timesteps: torch.Tensor, mask: torch.Tensor
+    ) -> torch.Tensor:
+        if self.snr_gamma is None:
+            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
+            loss = loss_with_mask(loss, mask)
+        else:
+            # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+            # Adaptef from huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py
+            snr = self.compute_snr(timesteps)
+            mse_loss_weights = (
+                torch.stack([snr, self.snr_gamma * torch.ones_like(timesteps)],
+                            dim=1).min(dim=1)[0] / snr
+            )
+            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
+            loss = loss_with_mask(loss, mask, reduce=False) * mse_loss_weights
+            loss = loss.mean()
+        return loss
+
+
+class AudioDiffusion(
+    LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
+    DiffusionMixin
+):  
+    """
+    Args:
+        autoencoder (AutoEncoderBase): Pretrained autoencoder module VAE(frozen).
+        content_encoder (ContentEncoder): Encodes TCC and TDC information.
+        backbone (nn.Module): Main denoising network.
+        frame_resolution (float): Resolution for audio frames.
+        noise_scheduler_name (str): Noise scheduler identifier.
+        snr_gamma (float, optional): SNR gamma for noise scheduler.
+        classifier_free_guidance (bool): Enable classifier-free guidance.
+        cfg_drop_ratio (float): Ratio for randomly dropping context for classifier-free guidance.
+    """
+    def __init__(
+        self,
+        autoencoder: AutoEncoderBase,
+        content_encoder: ContentEncoder,
+        backbone: nn.Module,
+        frame_resolution:float,
+        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
+        snr_gamma: float = None,
+        classifier_free_guidance: bool = True,
+        cfg_drop_ratio: float = 0.2,
+    ):
+        nn.Module.__init__(self)
+        DiffusionMixin.__init__(
+            self, noise_scheduler_name, snr_gamma, classifier_free_guidance, cfg_drop_ratio
+        )
+        
+        self.autoencoder = autoencoder
+        # Freeze autoencoder parameters
+        for param in self.autoencoder.parameters():
+            param.requires_grad = False
+
+        self.content_encoder = content_encoder
+        self.backbone = backbone
+        self.frame_resolution = frame_resolution
+        self.dummy_param = nn.Parameter(torch.empty(0))
+
+    def forward(
+        self, content: list[Any], condition: list[Any], task: list[str],
+        waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
+    ):  
+        """
+        Training forward pass.
+
+        Args:
+            content (list[Any]): List of content dicts for each sample.
+            condition (list[Any]): Conditioning information (unused here).
+            task (list[str]): List of task types.
+            waveform (Tensor): Batch of waveform tensors.
+            waveform_lengths (Tensor): Lengths for each waveform sample.
+
+        Returns:
+            dict: Dictionary containing the diffusion loss.
+        """
+        device = self.dummy_param.device
+        num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
+        self.noise_scheduler.set_timesteps(num_train_timesteps, device=device)
+
+        self.autoencoder.eval()
+        with torch.no_grad():
+            latent, latent_mask = self.autoencoder.encode(
+                waveform.unsqueeze(1), waveform_lengths
+            )
+        # content(non_time_aligned_content) for TCC and time_aligned_content for TDC
+        content, content_mask, onset, _= self.content_encoder.encode_content(
+            content, device=device
+        )
+
+        # prepare latent and diffusion-related noise
+        time_aligned_content = onset.permute(0,2,1)
+        if self.training and self.classifier_free_guidance:
+            mask_indices = [
+                k for k in range(len(waveform)) if random.random() < self.cfg_drop_ratio
+            ]
+            if len(mask_indices) > 0:
+                content[mask_indices] = 0
+                time_aligned_content[mask_indices] = 0
+
+        batch_size = latent.shape[0]
+        timesteps = self.get_timesteps(batch_size, device, self.training)
+        noise = torch.randn_like(latent)
+        noisy_latent = self.noise_scheduler.add_noise(latent, noise, timesteps)
+        target = self.get_target(latent, noise, timesteps)
+
+        # Denoising prediction
+        pred: torch.Tensor = self.backbone(
+            x=noisy_latent,
+            timesteps=timesteps,
+            time_aligned_context=time_aligned_content,
+            context=content,
+            x_mask=latent_mask,
+            context_mask=content_mask
+        )
+        pred = pred.transpose(1, self.autoencoder.time_dim)
+        target = target.transpose(1, self.autoencoder.time_dim)
+        diff_loss = self.loss_with_snr(pred, target, timesteps, latent_mask)
+        return {
+            "diff_loss": diff_loss,
+        }
+
+    @torch.no_grad()
+    def inference(
+        self,
+        content: list[Any],
+        num_steps: int = 20,
+        guidance_scale: float = 3.0,
+        guidance_rescale: float = 0.0,
+        disable_progress: bool = True,
+        num_samples_per_content: int = 1,
+        **kwargs
+    ):
+        """
+        Inference/generation method for audio diffusion.
+
+        Args:
+            content (list[Any]): List of content dicts.
+            scheduler (SchedulerMixin): Scheduler for timesteps and noise.
+            num_steps (int): Number of denoising steps.
+            guidance_scale (float): Classifier-free guidance scale.
+            guidance_rescale (float): Rescale factor for guidance.
+            disable_progress (bool): Disable progress bar.
+            num_samples_per_content (int): How many samples to generate per content.
+
+        Returns:
+            waveform (Tensor): Generated waveform.
+        """
+        device = self.dummy_param.device
+        classifier_free_guidance = guidance_scale > 1.0
+        batch_size = len(content) * num_samples_per_content
+        print(content)
+        if classifier_free_guidance:
+            content, content_mask, onset, length_list = self.encode_content_classifier_free(
+                content, num_samples_per_content
+            )
+        else:
+            content, content_mask, onset, length_list = self.content_encoder.encode_content(
+            content, device=device
+        )
+            content = content.repeat_interleave(num_samples_per_content, 0)
+            content_mask = content_mask.repeat_interleave(
+                num_samples_per_content, 0
+            )
+
+        self.noise_scheduler.set_timesteps(num_steps, device=device)
+        timesteps = self.noise_scheduler.timesteps
+
+
+        # prepare input latent and context for the backbone
+        shape = (batch_size, 128, onset.shape[2])  # 128 for StableVAE channels
+        time_aligned_content = onset.permute(0,2,1)
+        latent = randn_tensor(
+            shape, generator=None, device=device, dtype=content.dtype
+        )
+        
+        # scale the initial noise by the standard deviation required by the scheduler
+        latent = latent * self.noise_scheduler.init_noise_sigma
+        latent_mask = torch.full((batch_size, onset.shape[2]), False, device=device)
+        
+        for i, length in enumerate(length_list):
+        # Set latent mask True for valid time steps for each sample
+            latent_mask[i, :length] = True
+        num_warmup_steps = len(timesteps) - num_steps * self.noise_scheduler.order
+        progress_bar = tqdm(range(num_steps), disable=disable_progress)
+
+        if classifier_free_guidance:
+            uncond_time_aligned_content = torch.zeros_like(
+                time_aligned_content
+            )
+            time_aligned_content = torch.cat(
+                [uncond_time_aligned_content, time_aligned_content]
+            )
+            latent_mask = torch.cat(
+                [latent_mask, latent_mask.detach().clone()]
+            )
+
+        # iteratively denoising
+
+        for i, timestep in enumerate(timesteps):
+
+            latent_input = torch.cat(
+                [latent, latent]
+            ) if classifier_free_guidance else latent
+            latent_input = self.noise_scheduler.scale_model_input(latent_input, timestep)
+
+            noise_pred = self.backbone(
+                x=latent_input,
+                x_mask=latent_mask,
+                timesteps=timestep,
+                time_aligned_context=time_aligned_content,
+                context=content,
+                context_mask=content_mask,
+            )
+
+            if classifier_free_guidance:
+                noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_content - noise_pred_uncond
+                )
+                if guidance_rescale != 0.0:
+                    noise_pred = self.rescale_cfg(
+                        noise_pred_content, noise_pred, guidance_rescale
+                    )
+            # compute the previous noisy sample x_t -> x_t-1
+            latent = self.noise_scheduler.step(noise_pred, timestep, latent).prev_sample
+            
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
+                                           (i+1) % self.noise_scheduler.order == 0):
+                progress_bar.update(1)
+        #latent = latent.to(next(self.autoencoder.parameters()).device)
+        waveform = self.autoencoder.decode(latent)
+        return waveform
+
+    def encode_content_classifier_free(
+        self,
+        content: list[Any],
+        task: list[str],
+        num_samples_per_content: int = 1
+    ):
+        device = self.dummy_param.device
+
+        content, content_mask, onset, length_list = self.content_encoder.encode_content(
+            content, device=device
+        )
+        content = content.repeat_interleave(num_samples_per_content, 0)
+        content_mask = content_mask.repeat_interleave(
+            num_samples_per_content, 0
+        )
+
+        # get unconditional embeddings for classifier free guidance
+        uncond_content = torch.zeros_like(content)
+        uncond_content_mask = content_mask.detach().clone()
+
+        uncond_content = uncond_content.repeat_interleave(
+            num_samples_per_content, 0
+        )
+        uncond_content_mask = uncond_content_mask.repeat_interleave(
+            num_samples_per_content, 0
+        )
+
+        # For classifier free guidance, we need to do two forward passes.
+        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
+        content = torch.cat([uncond_content, content])
+        content_mask = torch.cat([uncond_content_mask, content_mask])
+
+        return content, content_mask, onset, length_list
+    
+    def rescale_cfg(
+        self, pred_cond: torch.Tensor, pred_cfg: torch.Tensor,
+        guidance_rescale: float
+    ):
+        """
+        Rescale `pred_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+        Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+        """
+        std_cond = pred_cond.std(
+            dim=list(range(1, pred_cond.ndim)), keepdim=True
+        )
+        std_cfg = pred_cfg.std(dim=list(range(1, pred_cfg.ndim)), keepdim=True)
+
+        pred_rescaled = pred_cfg * (std_cond / std_cfg)
+        pred_cfg = guidance_rescale * pred_rescaled + (
+            1 - guidance_rescale
+        ) * pred_cfg
diff --git a/models/dit/__pycache__/attention.cpython-310.pyc b/models/dit/__pycache__/attention.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b07aff7cc03ba914ae53b19b0806f684b364ca0
Binary files /dev/null and b/models/dit/__pycache__/attention.cpython-310.pyc differ
diff --git a/models/dit/__pycache__/audio_dit.cpython-310.pyc b/models/dit/__pycache__/audio_dit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd57841061f81c6d99ffa668b6c2f6b84a7be942
Binary files /dev/null and b/models/dit/__pycache__/audio_dit.cpython-310.pyc differ
diff --git a/models/dit/__pycache__/mask_dit.cpython-310.pyc b/models/dit/__pycache__/mask_dit.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d522a6de45cf7a054a70df04fa1a653d29b8067d
Binary files /dev/null and b/models/dit/__pycache__/mask_dit.cpython-310.pyc differ
diff --git a/models/dit/__pycache__/modules.cpython-310.pyc b/models/dit/__pycache__/modules.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3c5c37bc20a567a4591c402a33e25e7a439568e
Binary files /dev/null and b/models/dit/__pycache__/modules.cpython-310.pyc differ
diff --git a/models/dit/__pycache__/rotary.cpython-310.pyc b/models/dit/__pycache__/rotary.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ed908b11418e29881c19d938b50fdd353a56e18c
Binary files /dev/null and b/models/dit/__pycache__/rotary.cpython-310.pyc differ
diff --git a/models/dit/__pycache__/span_mask.cpython-310.pyc b/models/dit/__pycache__/span_mask.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50398017bd403c537104d81ceac0cb16d3b61586
Binary files /dev/null and b/models/dit/__pycache__/span_mask.cpython-310.pyc differ
diff --git a/models/dit/attention.py b/models/dit/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d6a665c87fc8b17a59b9d03cf325c273df50392
--- /dev/null
+++ b/models/dit/attention.py
@@ -0,0 +1,350 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+from .modules import RMSNorm
+
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+
+
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+
+
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+
+    b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
+    #print(q_mask)
+    q_mask = default(
+        q_mask, torch.ones((b, i), device=device, dtype=torch.bool)
+    )
+    k_mask = default(
+        k_mask, torch.ones((b, j), device=device, dtype=torch.bool)
+    )
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1'
+                         ) * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+
+
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        attn_drop=0.,
+        proj_drop=0.,
+        rope_mode='none'
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        if context_dim is None:
+            self.cross_attn = False
+        else:
+            self.cross_attn = True
+
+        context_dim = dim if context_dim is None else context_dim
+
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+
+        if qk_norm is None:
+            self.norm_q = nn.Identity()
+            self.norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            self.norm_q = nn.LayerNorm(head_dim)
+            self.norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            self.norm_q = RMSNorm(head_dim)
+            self.norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        if self.cross_attn:
+            assert rope_mode == 'none'
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = self.rotary_c(
+                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
+            )
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
+            )
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(
+                attn, mask_binary
+            ) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+
+    def forward(self, x, context=None, context_mask=None, extras=0):
+        B, L, C = x.shape
+        if context is None:
+            context = x
+
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+
+        if context_mask is not None:
+            mask_binary = create_mask(
+                x.shape, context.shape, x.device, None, context_mask
+            )
+        else:
+            mask_binary = None
+
+        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads)
+        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads)
+        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads)
+
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+
+        q, k = self._rotary(q, k, extras)
+
+        x = self._attn(q, k, v, mask_binary)
+
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class JointAttention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        attn_drop=0.,
+        proj_drop=0.,
+        rope_mode='none'
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+
+        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(
+            dim, qkv_bias
+        )
+        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(
+            dim, qkv_bias
+        )
+
+        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
+        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)
+
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+
+        self.proj_x = nn.Linear(dim, dim)
+        self.proj_drop_x = nn.Dropout(proj_drop)
+
+        self.proj_c = nn.Linear(dim, dim)
+        self.proj_drop_c = nn.Dropout(proj_drop)
+
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+
+    def _make_qkv_layers(self, dim, qkv_bias):
+        return (
+            nn.Linear(dim, dim,
+                      bias=qkv_bias), nn.Linear(dim, dim, bias=qkv_bias),
+            nn.Linear(dim, dim, bias=qkv_bias)
+        )
+
+    def _make_norm_layers(self, qk_norm, head_dim):
+        if qk_norm is None:
+            norm_q = nn.Identity()
+            norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            norm_q = nn.LayerNorm(head_dim)
+            norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            norm_q = RMSNorm(head_dim)
+            norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        return norm_q, norm_k
+
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = self.rotary_c(
+                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
+            )
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
+            )
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(
+                attn, mask_binary
+            ) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+
+    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
+        B = x.shape[0]
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(
+                B, context.shape[-2], device=context.device
+            ).bool()
+        mask = torch.cat([context_mask, x_mask], dim=1)
+        return mask
+
+    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
+        B, Lx, C = x.shape
+        _, Lc, _ = context.shape
+        if x_mask is not None or context_mask is not None:
+            mask = self._cat_mask(
+                x, context, x_mask=x_mask, context_mask=context_mask
+            )
+            shape = [B, Lx + Lc, C]
+            mask_binary = create_mask(
+                q_shape=shape,
+                k_shape=shape,
+                device=x.device,
+                q_mask=None,
+                k_mask=mask
+            )
+        else:
+            mask_binary = None
+
+        qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
+        qc, kc, vc = self.to_qc(context), self.to_kc(context
+                                                    ), self.to_vc(context)
+
+        qx, kx, vx = map(
+            lambda t: einops.
+            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
+            [qx, kx, vx]
+        )
+        qc, kc, vc = map(
+            lambda t: einops.
+            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
+            [qc, kc, vc]
+        )
+
+        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
+        qc, kc = self.norm_qc(qc), self.norm_kc(kc)
+
+        q, k, v = (
+            torch.cat([qc, qx],
+                      dim=2), torch.cat([kc, kx],
+                                        dim=2), torch.cat([vc, vx], dim=2)
+        )
+
+        q, k = self._rotary(q, k, extras)
+
+        x = self._attn(q, k, v, mask_binary)
+
+        context, x = x[:, :Lc, :], x[:, Lc:, :]
+
+        x = self.proj_x(x)
+        x = self.proj_drop_x(x)
+
+        context = self.proj_c(context)
+        context = self.proj_drop_c(context)
+
+        return x, context
diff --git a/models/dit/audio_diffsingernet_dit.py b/models/dit/audio_diffsingernet_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a5facb5c2316a04f0010a477ce0b6d7268d043a
--- /dev/null
+++ b/models/dit/audio_diffsingernet_dit.py
@@ -0,0 +1,520 @@
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from .mask_dit import DiTBlock, FinalBlock, UDiT
+from .modules import (
+    film_modulate,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    RMSNorm,
+)
+
+
+class AudioDiTBlock(DiTBlock):
+    """
+    A modified DiT block with time_aligned_context add to latent.
+    """
+    def __init__(
+        self,
+        dim,
+        time_aligned_context_dim,
+        dilation,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        super().__init__(
+            dim=dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=skip,
+            skip_norm=skip_norm,
+            rope_mode=rope_mode,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        # time-aligned context projection
+        self.ta_context_projection = nn.Linear(
+            time_aligned_context_dim, 2 * dim
+        )
+        self.dilated_conv = nn.Conv1d(
+            dim, 2 * dim, kernel_size=3, padding=dilation, dilation=dilation
+        )
+
+    def forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        if self.use_checkpoint:
+            return checkpoint(
+                self._forward,
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+            )
+
+    def _forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        B, T, C = x.shape
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+
+        # self attention
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            x = x + (1-gate_msa) * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            # TODO diffusion timestep input is not fused here
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+
+        # time-aligned context
+        time_aligned_context = self.ta_context_projection(time_aligned_context)
+        x = self.dilated_conv(x.transpose(1, 2)
+                             ).transpose(1, 2) + time_aligned_context
+
+        gate, filter = torch.chunk(x, 2, dim=-1)
+        x = torch.sigmoid(gate) * torch.tanh(filter)
+
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+
+        return x
+
+
+class AudioUDiT(UDiT):
+    """
+    U-shaped DiT whose blocks are ``AudioDiTBlock`` instances, so every block
+    receives a time-aligned context next to the optional sequence context.
+
+    The stack is ``depth // 2`` input blocks, one middle block and
+    ``depth // 2`` output blocks. With ``skip`` enabled, the output of each
+    input block is handed to an output block in last-in-first-out order.
+    """
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        dilation_cycle_length=4,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        time_aligned_context_dim=768,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        """
+        Build the embeddings, the in/mid/out block stacks and the final block.
+
+        Args (those whose handling is visible in this constructor):
+            img_size: input size; indexed as a pair for ``input_type='2d'``
+                and used as a plain number for ``'1d'``.
+            patch_size: patch size; divides ``img_size`` to get the number of
+                patches for the positional embedding.
+            out_chans: output channels; defaults to ``in_chans``.
+            depth: ``depth // 2`` input and output blocks are created, plus
+                one middle block.
+            dilation_cycle_length: block ``i`` of each stack uses dilation
+                ``2 ** (i % dilation_cycle_length)``.
+            norm_layer: ``'layernorm'`` or ``'rmsnorm'``.
+            time_fusion: ``'token'`` or one of ``'ada'``, ``'ada_single'``,
+                ``'ada_sola'``, ``'ada_sola_bias'``.
+            cls_dim: when not None, a class-embedding MLP is created.
+            context_dim: when not None, a context-embedding MLP is created.
+            context_fusion: ``'concat'`` / ``'joint'`` (no cross-attention
+                layers) or ``'cross'`` (cross-attention in every block).
+            skip, skip_norm: forwarded to the output blocks only.
+
+        Raises:
+            NotImplementedError: unsupported ``time_fusion``,
+                ``context_fusion`` or ``norm_layer``.
+        """
+        # nn.Module is initialised directly; UDiT.__init__ is not invoked, so
+        # every submodule used by this class is created below.
+        nn.Module.__init__(self)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        # NOTE(review): the '2d' branch indexes img_size, so the int default
+        # (224) only works with input_type='1d'; any other input_type leaves
+        # num_patches undefined and fails further down.
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+
+        # time fusion
+        # self.extras counts the tokens placed in front of the latent
+        # sequence (time/cls tokens here, context tokens further down).
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # avoid a repeated SiLU in every adaLN block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            # produces the modulation passed to the final block
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                # context tokens become part of the sequence, so they count
+                # as extras as well
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                # blocks cross-attend to the already embedded context
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+
+        self.use_skip = skip
+
+        # norm layers
+        # from here on norm_layer is the class, not the config string
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+
+        # input half of the U: no skip inputs; dilation cycles through
+        # 1, 2, 4, ... with period dilation_cycle_length
+        self.in_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                time_aligned_context_dim=time_aligned_context_dim,
+                dilation=2**(i % dilation_cycle_length),
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+
+        # bottleneck block: fixed dilation of 1, no skip input
+        self.mid_block = AudioDiTBlock(
+            dim=embed_dim,
+            time_aligned_context_dim=time_aligned_context_dim,
+            dilation=1,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+
+        # output half of the U: the only blocks that receive the skip and
+        # skip_norm settings
+        self.out_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                time_aligned_context_dim=time_aligned_context_dim,
+                dilation=2**(i % dilation_cycle_length),
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        # not defined in this class; presumably inherited from UDiT -- confirm
+        self.initialize_weights()
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        time_aligned_context,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        """
+        Run the full network on a (noisy) input.
+
+        Args:
+            x: input passed to the patch embedding, which yields a
+                ``(B, L, D)`` token sequence.
+            timesteps: diffusion timesteps; a 0-dim tensor is expanded to one
+                entry per batch element.
+            time_aligned_context: passed unchanged to every block.
+            context: sequence context; embedded only when a context embedding
+                was configured.
+            x_mask: optional mask over the latent tokens.
+            context_mask: optional mask over the context tokens.
+            cls_token: class conditioning; used only when a class embedding
+                was configured.
+            controlnet_skips: optional stack of residuals, one popped per
+                output block. NOTE(review): ``pop()`` consumes the caller's
+                object in place.
+
+        Returns:
+            The output of the final block.
+        """
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+
+        B, L, D = x.shape
+
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                # context is merged into the sequence (and its mask into
+                # x_mask) by _concat_x_context, so the blocks get no separate
+                # context; the helper is not defined in this class --
+                # presumably inherited from UDiT, confirm
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            # adaLN path: time (+ class) embedding conditions the blocks
+            # through modulation instead of extra tokens
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            # token path: time (and class) embeddings are prepended to the
+            # sequence as extra tokens
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                # the prepended tokens are always valid
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            # already part of x; blocks must not receive it separately
+            time_token = None
+
+        # input blocks; their outputs are collected for the skip connections
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+
+        x = self.mid_block(
+            x=x,
+            time_aligned_context=time_aligned_context,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        # output blocks consume the skips in reverse order of creation
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+
+        # extras is handed to the final block, presumably so it can drop the
+        # prepended tokens -- confirm in FinalBlock
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+
+        return x
diff --git a/models/dit/audio_dit.py b/models/dit/audio_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..643e8e82d7c44796ad6c04e109102199ead6b246
--- /dev/null
+++ b/models/dit/audio_dit.py
@@ -0,0 +1,549 @@
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from .mask_dit import DiTBlock, FinalBlock, UDiT
+from .modules import (
+    film_modulate,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    RMSNorm,
+)
+
+
+class AudioDiTBlock(DiTBlock):
+    """
+    DiT block that additionally fuses a time aligned context into the latent.
+
+    The fusion happens between the self attention and the (optional) cross
+    attention step, either by adding the projected context to ``x``
+    (``ta_context_fusion='add'``) or by concatenating it to ``x`` on the
+    channel axis and projecting back to ``dim`` (``'concat'``).
+    """
+    def __init__(
+        self,
+        dim,
+        ta_context_dim,
+        ta_context_norm=False,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        ta_context_fusion='add',
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        # everything except the ta_context_* options is handled by DiTBlock
+        super().__init__(
+            dim=dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=skip,
+            skip_norm=skip_norm,
+            rope_mode=rope_mode,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.ta_context_fusion = ta_context_fusion
+
+        # width of the tensor that is normalised and projected back to `dim`
+        if ta_context_fusion == "add":
+            fused_dim = ta_context_dim
+        elif ta_context_fusion == "concat":
+            fused_dim = ta_context_dim + dim
+        else:
+            fused_dim = None
+
+        if fused_dim is None:
+            # any other fusion mode: no layers are built, the raw flag is kept
+            self.ta_context_norm = ta_context_norm
+        else:
+            self.ta_context_projection = nn.Linear(fused_dim, dim)
+            if ta_context_norm:
+                self.ta_context_norm = norm_layer(fused_dim)
+            else:
+                self.ta_context_norm = nn.Identity()
+
+    def forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Run the block, through gradient checkpointing when
+        ``use_checkpoint`` is set, otherwise directly.
+        """
+        block_inputs = (
+            x,
+            time_aligned_context,
+            time_token,
+            time_ada,
+            skip,
+            context,
+            x_mask,
+            context_mask,
+            extras,
+        )
+        if not self.use_checkpoint:
+            return self._forward(*block_inputs)
+        return checkpoint(self._forward, *block_inputs, use_reentrant=False)
+
+    def _forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Actual computation: long skip merge, self attention, time aligned
+        context fusion, optional cross attention, MLP.
+        """
+        # values are unused; the unpacking only succeeds for a 3-D input
+        _batch, _tokens, _channels = x.shape
+
+        # long skip connection: concat on channels, normalise, project back
+        if self.skip_linear is not None:
+            assert skip is not None
+            merged = torch.cat([x, skip], dim=-1)
+            x = self.skip_linear(self.skip_norm(merged))
+
+        # self attention
+        if self.use_adanorm:
+            modulation = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = modulation.chunk(6, dim=1)
+            attn_in = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            attn_out = self.attn(
+                attn_in, context=None, context_mask=x_mask, extras=extras
+            )
+            x = x + (1-gate_msa) * attn_out
+        else:
+            # TODO diffusion timestep input is not fused here
+            attn_out = self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+            x = x + attn_out
+
+        # time aligned context fusion
+        if self.ta_context_fusion == "add":
+            projected = self.ta_context_projection(
+                self.ta_context_norm(time_aligned_context)
+            )
+            x = x + projected
+        elif self.ta_context_fusion == "concat":
+            stacked = torch.cat([x, time_aligned_context], dim=-1)
+            x = self.ta_context_projection(self.ta_context_norm(stacked))
+
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            cross_out = self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+            x = x + cross_out
+
+        # mlp
+        if self.use_adanorm:
+            mlp_in = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(mlp_in)
+        else:
+            x = x + self.mlp(self.norm3(x))
+
+        return x
+
+
+class AudioUDiT(UDiT):
+    """
+    U-shaped DiT built from ``AudioDiTBlock`` layers.
+
+    Compared with the plain blocks, every block additionally receives
+    ``time_aligned_context`` in ``forward``. The constructor calls
+    ``nn.Module.__init__`` directly, so ``UDiT.__init__`` is not executed and
+    all layers are created here. ``initialize_weights`` and
+    ``_concat_x_context`` are used but not defined in this class; they are
+    presumably inherited from ``UDiT`` (TODO confirm).
+    """
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        ta_context_dim=768,
+        ta_context_fusion='concat',
+        ta_context_norm=True,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        """
+        Build the patch embedding, the conditioning embedders and the block
+        stack (``depth // 2`` input blocks, one middle block, ``depth // 2``
+        output blocks, then the final block).
+
+        Selected arguments:
+            img_size: indexed as ``img_size[0]``/``img_size[1]`` when
+                ``input_type == '2d'`` and divided directly when ``'1d'``.
+            out_chans: output channels; falls back to ``in_chans`` if None.
+            norm_layer: ``'layernorm'`` or ``'rmsnorm'``.
+            time_fusion: ``'token'`` or one of ``'ada'``, ``'ada_single'``,
+                ``'ada_sola'``, ``'ada_sola_bias'``.
+            cls_dim: when not None a small MLP embedding for ``cls_token``
+                is created.
+            context_dim: when None no context is used at all.
+            context_fusion: ``'concat'``/``'joint'`` or ``'cross'``.
+            skip: enables the long skip connections into the output blocks.
+
+        Raises:
+            NotImplementedError: for an unsupported ``time_fusion``,
+                ``context_fusion`` (only checked when ``context_dim`` is not
+                None) or ``norm_layer``.
+        """
+        # only the nn.Module machinery is initialised; UDiT.__init__ is skipped
+        nn.Module.__init__(self)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        # NOTE(review): the int default img_size=224 is not indexable, so the
+        # default '2d' combination looks unusable as-is; any other input_type
+        # leaves num_patches undefined - confirm intended usage
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+
+        # time fusion
+        # self.extras counts the non-target tokens in front of the sequence;
+        # it is passed to every block and to the final block
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # avoid repeating the SiLU in each adaln block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            # shift/scale for the final block
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                # context tokens become part of the sequence
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                # (context_dim is reused below as the blocks' context_dim)
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                # blocks cross-attend to the already embedded context
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+
+        self.use_skip = skip
+
+        # norm layers
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+
+        # input half of the U: blocks without skip input
+        self.in_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+
+        # single middle block, also without skip input
+        self.mid_block = AudioDiTBlock(
+            dim=embed_dim,
+            ta_context_dim=ta_context_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            ta_context_fusion=ta_context_fusion,
+            ta_context_norm=ta_context_norm,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+
+        # output half of the U: blocks take the long skip when `skip` is set
+        self.out_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        time_aligned_context,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        """
+        Run the full network.
+
+        Args:
+            x: input handed to ``patch_embed``.
+            timesteps: diffusion timestep tensor; a 0-dim tensor is expanded
+                to one entry per batch element.
+            time_aligned_context: forwarded unchanged to every block.
+            context: embedded with ``context_embed`` when context is enabled.
+            x_mask: optional mask for the tokens of ``x``.
+            context_mask: optional mask for the context tokens.
+            cls_token: input of ``cls_embed`` when that embedder exists.
+            controlnet_skips: optional list of residuals; entries are removed
+                with ``pop()``, so the caller's list is modified.
+
+        Returns:
+            The output of ``final_block``.
+        """
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+
+        B, L, D = x.shape
+
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                # context joins the token sequence, so the blocks get no
+                # separate context input afterwards
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            # adaLN path: timestep (plus cls) embedding drives the modulation
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            # token path: time (and cls) tokens are prepended to the sequence
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                # the prepended tokens get all-True mask entries
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            # already part of x, the blocks must not receive it again
+            time_token = None
+
+        # outputs of the input blocks, consumed in reverse by the output blocks
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+
+            if self.use_skip:
+                skips.append(x)
+
+        x = self.mid_block(
+            x=x,
+            time_aligned_context=time_aligned_context,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+
+        for blk in self.out_blocks:
+            if self.use_skip:
+                # last stored input-block output goes to the first output block
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+
+        return x
diff --git a/models/dit/mask_dit.py b/models/dit/mask_dit.py
new file mode 100644
index 0000000000000000000000000000000000000000..949e7807e14cbbee6dae942a19a249465a5c5a91
--- /dev/null
+++ b/models/dit/mask_dit.py
@@ -0,0 +1,823 @@
+import logging
+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+
+from .modules import (
+    film_modulate,
+    unpatchify,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    FeedForward,
+    RMSNorm,
+)
+from .span_mask import compute_mask_indices
+from .attention import Attention
+
+# Module logger. Obtained through getLogger so that it is registered in the
+# logging hierarchy and honours the application's logging configuration; a
+# directly instantiated Logger has no parent and its info() records are lost.
+logger = logging.getLogger(__name__)
+
+
+class AdaLN(nn.Module):
+    """
+    Produce the per-block modulation tensor of shape ``(B, 6, dim)`` that a
+    block splits into shift/scale/gate for its attention and MLP branches.
+
+    Supported ``ada_mode`` values:
+        'ada': the block owns a ``Linear(dim, 6 * dim)`` applied to
+            ``time_token``; no shared ``time_ada`` may be passed.
+        'ada_single': a learnable ``(6, dim)`` table is added to the shared
+            ``time_ada``.
+        'ada_sola': a low-rank correction ``lora_b(lora_a(time_token))``
+            scaled by ``alpha / r`` is added to the shared ``time_ada``.
+        'ada_sola_bias': like 'ada_sola' plus the learnable ``(6, dim)``
+            table.
+
+    Args:
+        dim: channel width of the block.
+        ada_mode: one of the modes above.
+        r: rank of the low-rank correction (sola modes only).
+        alpha: scaling numerator of the low-rank correction (sola modes only).
+
+    Raises:
+        NotImplementedError: for any other ``ada_mode``.
+    """
+    def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
+        super().__init__()
+        self.ada_mode = ada_mode
+        self.scale_shift_table = None
+        if ada_mode == 'ada':
+            # move nn.silu outside
+            self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
+        elif ada_mode == 'ada_single':
+            # adaln used in pixel-art alpha
+            self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        elif ada_mode in ['ada_sola', 'ada_sola_bias']:
+            # the mode names must match the ones handled in forward()
+            self.lora_a = nn.Linear(dim, r * 6, bias=False)
+            self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
+            self.scaling = alpha / r
+            if ada_mode == 'ada_sola_bias':
+                # take bias out for consistency
+                self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        else:
+            raise NotImplementedError
+
+    def forward(self, time_token=None, time_ada=None):
+        """
+        Return the modulation tensor reshaped to ``(B, 6, -1)``.
+
+        ``time_token`` is required for 'ada' and the sola modes,
+        ``time_ada`` (the shared projection) for every mode except 'ada'.
+        """
+        if self.ada_mode == 'ada':
+            assert time_ada is None
+            B = time_token.shape[0]
+            time_ada = self.time_ada(time_token).reshape(B, 6, -1)
+        elif self.ada_mode == 'ada_single':
+            B = time_ada.shape[0]
+            time_ada = time_ada.reshape(B, 6, -1)
+            time_ada = self.scale_shift_table[None] + time_ada
+        elif self.ada_mode in ['ada_sola', 'ada_sola_bias']:
+            B = time_ada.shape[0]
+            time_ada_lora = self.lora_b(self.lora_a(time_token)) * self.scaling
+            time_ada = time_ada + time_ada_lora
+            time_ada = time_ada.reshape(B, 6, -1)
+            if self.scale_shift_table is not None:
+                time_ada = self.scale_shift_table[None] + time_ada
+        else:
+            raise NotImplementedError
+        return time_ada
+
+
+class DiTBlock(nn.Module):
+    """
+    Transformer block made of self attention, optional cross attention and
+    an MLP, each applied residually. Optionally merges a long skip input
+    first and, unless ``time_fusion == 'token'``, modulates the attention
+    and MLP branches through an ``AdaLN`` module.
+    """
+    def __init__(
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+
+        super().__init__()
+
+        # self attention branch
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim=dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            rope_mode=rope_mode
+        )
+
+        # cross attention branch, only built when a context width is given
+        self.use_context = context_dim is not None
+        if self.use_context:
+            self.cross_attn = Attention(
+                dim=dim,
+                num_heads=num_heads,
+                context_dim=context_dim,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                rope_mode='none'
+            )
+            self.norm2 = norm_layer(dim)
+            self.norm_context = (
+                norm_layer(context_dim) if context_norm else nn.Identity()
+            )
+
+        # feed forward branch
+        self.norm3 = norm_layer(dim)
+        self.mlp = FeedForward(
+            dim=dim, mult=mlp_ratio, activation_fn=act_layer, dropout=0
+        )
+
+        # every fusion mode except 'token' goes through AdaLN
+        self.use_adanorm = time_fusion != 'token'
+        if self.use_adanorm:
+            self.adaln = AdaLN(
+                dim,
+                ada_mode=time_fusion,
+                r=ada_sola_rank,
+                alpha=ada_sola_alpha
+            )
+
+        # long skip: concat on channels, optional norm, project back to dim
+        if not skip:
+            self.skip_linear = None
+        else:
+            self.skip_norm = (
+                norm_layer(2 * dim) if skip_norm else nn.Identity()
+            )
+            self.skip_linear = nn.Linear(2 * dim, dim)
+
+        self.use_checkpoint = use_checkpoint
+
+    def forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Run the block, through gradient checkpointing when
+        ``use_checkpoint`` is set, otherwise directly.
+        """
+        block_inputs = (
+            x, time_token, time_ada, skip, context, x_mask, context_mask,
+            extras
+        )
+        if not self.use_checkpoint:
+            return self._forward(*block_inputs)
+        return checkpoint(self._forward, *block_inputs, use_reentrant=False)
+
+    def _forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Actual computation: long skip merge, self attention, optional cross
+        attention, MLP.
+        """
+        # values are unused; the unpacking only succeeds for a 3-D input
+        _batch, _tokens, _channels = x.shape
+
+        if self.skip_linear is not None:
+            assert skip is not None
+            merged = torch.cat([x, skip], dim=-1)
+            x = self.skip_linear(self.skip_norm(merged))
+
+        # self attention
+        if self.use_adanorm:
+            modulation = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = modulation.chunk(6, dim=1)
+            attn_in = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            attn_out = self.attn(
+                attn_in, context=None, context_mask=x_mask, extras=extras
+            )
+            x = x + (1-gate_msa) * attn_out
+        else:
+            attn_out = self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+            x = x + attn_out
+
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            cross_out = self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+            x = x + cross_out
+
+        # mlp
+        if self.use_adanorm:
+            mlp_in = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(mlp_in)
+        else:
+            x = x + self.mlp(self.norm3(x))
+
+        return x
+
+
+class FinalBlock(nn.Module):
+    """
+    Output head: drops the leading extra tokens, normalises the rest
+    (optionally with shift/scale modulation), projects every token to its
+    patch values, unpatchifies and applies an optional convolution with
+    kernel size 3.
+    """
+    def __init__(
+        self,
+        embed_dim,
+        patch_size,
+        in_chans,
+        img_size,
+        input_type='2d',
+        norm_layer=nn.LayerNorm,
+        use_conv=True,
+        use_adanorm=True
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.img_size = img_size
+        self.input_type = input_type
+
+        self.norm = norm_layer(embed_dim)
+        self.use_adanorm = bool(use_adanorm)
+
+        # values per patch and matching conv type depend on the layout
+        if input_type == '2d':
+            self.patch_dim = patch_size**2 * in_chans
+            conv_type = nn.Conv2d
+        elif input_type == '1d':
+            self.patch_dim = patch_size * in_chans
+            conv_type = nn.Conv1d
+        else:
+            conv_type = None
+
+        # other input types get neither projection nor final layer
+        if conv_type is not None:
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = conv_type(
+                    self.in_chans, self.in_chans, 3, padding=1
+                )
+            else:
+                self.final_layer = nn.Identity()
+
+    def forward(self, x, time_ada=None, extras=0):
+        """
+        Map the token sequence ``x`` back to the output layout.
+
+        ``extras`` leading tokens are discarded; ``time_ada`` supplies the
+        shift/scale pair and is only read when adanorm is enabled.
+        """
+        batch, _tokens, _channels = x.shape
+        # only handle generation target
+        target = x[:, extras:, :]
+        normed = self.norm
+        if self.use_adanorm:
+            shift, scale = time_ada.reshape(batch, 2, -1).chunk(2, dim=1)
+            target = film_modulate(normed(target), shift, scale)
+        else:
+            target = normed(target)
+        patches = self.linear(target)
+        out = unpatchify(patches, self.in_chans, self.input_type, self.img_size)
+        return self.final_layer(out)
+
+
+class UDiT(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        # time fusion ada or token
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        # max length is only used for concat
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+
+        logger.info(f'x position embedding: {pe_method}')
+        logger.info(f'rope mode: {self.rope}')
+
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+
+        # time fusion
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # aviod  repetitive silu for each adaln block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        logger.info(f'time fusion mode: {self.time_fusion}')
+
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        logger.info(f'context fusion mode: {context_fusion}')
+        logger.info(f'context position embedding: {context_pe_method}')
+
+        self.use_skip = skip
+
+        # norm layers
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+
+        logger.info(f'use long skip connection: {skip}')
+        self.in_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+
+        self.mid_block = DiTBlock(
+            dim=embed_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+
+        self.out_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+
+    def _init_ada(self):
+        if self.time_fusion == 'ada':
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+        elif self.time_fusion == 'ada_single':
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+        elif self.time_fusion in ['ada_sola', 'ada_sola_bias']:
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+            nn.init.kaiming_uniform_(
+                self.mid_block.adaln.lora_a.weight, a=math.sqrt(5)
+            )
+            nn.init.constant_(self.mid_block.adaln.lora_b.weight, 0)
+            for block in self.out_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+
+    def initialize_weights(self):
+        # Basic init for all layers
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+
+        self.apply(_basic_init)
+
+        # init patch Conv like Linear
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+
+        # Zero-out AdaLN
+        if self.use_adanorm:
+            self._init_ada()
+
+        # Zero-out Cross Attention
+        if self.context_cross:
+            for block in self.in_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.weight, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+
+        # Zero-out cls embedding
+        if self.cls_embed:
+            if self.use_adanorm:
+                nn.init.constant_(self.cls_embed[-1].weight, 0)
+                nn.init.constant_(self.cls_embed[-1].bias, 0)
+
+        # Zero-out Output
+        # might not zero-out this when using v-prediction
+        # it could be good when using noise-prediction
+        # nn.init.constant_(self.final_block.linear.weight, 0)
+        # nn.init.constant_(self.final_block.linear.bias, 0)
+        # if self.use_conv:
+        #     nn.init.constant_(self.final_block.final_layer.weight.data, 0)
+        #     nn.init.constant_(self.final_block.final_layer.bias, 0)
+
+        # init out Conv
+        if self.use_conv:
+            nn.init.xavier_uniform_(self.final_block.final_layer.weight)
+            nn.init.constant_(self.final_block.final_layer.bias, 0)
+
+    def _concat_x_context(self, x, context, x_mask=None, context_mask=None):
+        assert context.shape[-2] == self.context_max_length
+        # Check if either x_mask or context_mask is provided
+        B = x.shape[0]
+        # Create default masks if they are not provided
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(
+                B, context.shape[-2], device=context.device
+            ).bool()
+        # Concatenate the masks along the second dimension (dim=1)
+        x_mask = torch.cat([context_mask, x_mask], dim=1)
+        # Concatenate context and x along the second dimension (dim=1)
+        x = torch.cat((context, x), dim=1)
+        return x, x_mask
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        """
+        Run the backbone: patch-embed ``x``, fuse the timestep / class /
+        context conditioning, apply the in, mid and out blocks, then project
+        with ``self.final_block``.
+
+        Args:
+            x: input fed to ``self.patch_embed``
+                (layout depends on ``input_type`` -- TODO confirm).
+            timesteps: timestep tensor; a 0-dim tensor is expanded to the
+                batch size and cast to ``torch.long``.
+            context: conditioning sequence, only used when
+                ``self.use_context`` is set.
+            x_mask: optional mask over the tokens of ``x``; it is extended
+                here whenever extra tokens are prepended.
+            context_mask: optional mask over the context tokens.
+            cls_token: input to ``self.cls_embed`` when that is configured.
+            controlnet_skips: optional list of residuals; entries are
+                ``pop()``-ed from the end, so the caller's list is consumed.
+
+        Returns:
+            The output of ``self.final_block``.
+        """
+        # make it compatible with int time step during inference
+        # NOTE(review): the cast to long would truncate a fractional scalar
+        # timestep -- confirm scalar timesteps are always integral.
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+
+        # B is reused below to build the mask of the prepended time tokens
+        B, L, D = x.shape
+
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                # context is prepended to the sequence, so the blocks receive
+                # no separate context / context mask
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            # adaLN path: class embedding is added to the time embedding,
+            # which then drives the modulation projections
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            # token path: time (and class) embeddings become extra tokens
+            # prepended to the sequence
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                # the prepended tokens are always marked valid
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            # already part of x; the blocks get no separate time token
+            time_token = None
+
+        # first half: optionally record each block output for the long skips
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+
+        x = self.mid_block(
+            x=x,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        # second half: skips are consumed in reverse order (last in, first out)
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+
+        return x
+
+
+class MaskDiT(nn.Module):
+    def __init__(
+        self,
+        model: UDiT,
+        mae=False,
+        mae_prob=0.5,
+        mask_ratio=[0.25, 1.0],
+        mask_span=10,
+    ):
+        super().__init__()
+        self.model = model
+        self.mae = mae
+        if self.mae:
+            out_channel = model.out_chans
+            self.mask_embed = nn.Parameter(torch.zeros((out_channel)))
+            self.mae_prob = mae_prob
+            self.mask_ratio = mask_ratio
+            self.mask_span = mask_span
+
+    def random_masking(self, gt, mask_ratios, mae_mask_infer=None):
+        B, D, L = gt.shape
+        if mae_mask_infer is None:
+            # mask = torch.rand(B, L).to(gt.device) < mask_ratios.unsqueeze(1)
+            mask_ratios = mask_ratios.cpu().numpy()
+            mask = compute_mask_indices(
+                shape=[B, L],
+                padding_mask=None,
+                mask_prob=mask_ratios,
+                mask_length=self.mask_span,
+                mask_type="static",
+                mask_other=0.0,
+                min_masks=1,
+                no_overlap=False,
+                min_space=0,
+            )
+            mask = mask.unsqueeze(1).expand_as(gt)
+        else:
+            mask = mae_mask_infer
+            mask = mask.expand_as(gt)
+        gt[mask] = self.mask_embed.view(1, D, 1).expand_as(gt)[mask]
+        return gt, mask.type_as(gt)
+
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        gt=None,
+        mae_mask_infer=None,
+        forward_model=True
+    ):
+        # todo: handle controlnet inside
+        mae_mask = torch.ones_like(x)
+        if self.mae:
+            if gt is not None:
+                B, D, L = gt.shape
+                mask_ratios = torch.FloatTensor(B).uniform_(*self.mask_ratio
+                                                           ).to(gt.device)
+                gt, mae_mask = self.random_masking(
+                    gt, mask_ratios, mae_mask_infer
+                )
+                # apply mae only to the selected batches
+                if mae_mask_infer is None:
+                    # determine mae batch
+                    mae_batch = torch.rand(B) < self.mae_prob
+                    gt[~mae_batch] = self.mask_embed.view(
+                        1, D, 1
+                    ).expand_as(gt)[~mae_batch]
+                    mae_mask[~mae_batch] = 1.0
+            else:
+                B, D, L = x.shape
+                gt = self.mask_embed.view(1, D, 1).expand_as(x)
+            x = torch.cat([x, gt, mae_mask[:, 0:1, :]], dim=1)
+
+        if forward_model:
+            x = self.model(
+                x=x,
+                timesteps=timesteps,
+                context=context,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                cls_token=cls_token
+            )
+            # logger.info(mae_mask[:, 0, :].sum(dim=-1))
+        return x, mae_mask
diff --git a/models/dit/modules.py b/models/dit/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2eec357ca6dfebf841768874b8cbd37112c3980
--- /dev/null
+++ b/models/dit/modules.py
@@ -0,0 +1,445 @@
+import warnings
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch.cuda.amp import autocast
+import math
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+
+
+def trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+    if (mean < a - 2*std) or (mean > b + 2*std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2
+        )
+
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a-mean) / std)
+        u = norm_cdf((b-mean) / std)
+
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2*l - 1, 2*u - 1)
+
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.))
+        tensor.add_(mean)
+
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+
+
+# disable in checkpoint mode
+# @torch.jit.script
+def film_modulate(x, shift, scale):
+    return x * (1+scale) + shift
+
+
+def timestep_embedding(timesteps, dim, max_period=10000):
+    """
+    Create sinusoidal timestep embeddings.
+
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period) *
+        torch.arange(start=0, end=half, dtype=torch.float32) / half
+    ).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding,
+                               torch.zeros_like(embedding[:, :1])],
+                              dim=-1)
+    return embedding
+
+
+class TimestepEmbedder(nn.Module):
+    """
+    Embeds scalar timesteps into vector representations.
+    """
+    def __init__(
+        self, hidden_size, frequency_embedding_size=256, out_size=None
+    ):
+        super().__init__()
+        if out_size is None:
+            out_size = hidden_size
+        self.mlp = nn.Sequential(
+            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
+            nn.SiLU(),
+            nn.Linear(hidden_size, out_size, bias=True),
+        )
+        self.frequency_embedding_size = frequency_embedding_size
+
+    def forward(self, t):
+        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(
+            self.mlp[0].weight.dtype
+        )
+        t_emb = self.mlp(t_freq)
+        return t_emb
+
+
+def patchify(imgs, patch_size, input_type='2d'):
+    if input_type == '2d':
+        x = einops.rearrange(
+            imgs,
+            'B C (h p1) (w p2) -> B (h w) (p1 p2 C)',
+            p1=patch_size,
+            p2=patch_size
+        )
+    elif input_type == '1d':
+        x = einops.rearrange(imgs, 'B C (h p1) -> B h (p1 C)', p1=patch_size)
+    return x
+
+
+def unpatchify(x, channels=3, input_type='2d', img_size=None):
+    if input_type == '2d':
+        patch_size = int((x.shape[2] // channels)**0.5)
+        # h = w = int(x.shape[1] ** .5)
+        h, w = img_size[0] // patch_size, img_size[1] // patch_size
+        assert h * w == x.shape[1] and patch_size**2 * channels == x.shape[2]
+        x = einops.rearrange(
+            x,
+            'B (h w) (p1 p2 C) -> B C (h p1) (w p2)',
+            h=h,
+            p1=patch_size,
+            p2=patch_size
+        )
+    elif input_type == '1d':
+        patch_size = int((x.shape[2] // channels))
+        h = x.shape[1]
+        assert patch_size * channels == x.shape[2]
+        x = einops.rearrange(x, 'B h (p1 C) -> B C (h p1)', h=h, p1=patch_size)
+    return x
+
+
+class PatchEmbed(nn.Module):
+    """
+     Image to Patch Embedding
+    """
+    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
+        super().__init__()
+        self.patch_size = patch_size
+        self.input_type = input_type
+        if input_type == '2d':
+            self.proj = nn.Conv2d(
+                in_chans,
+                embed_dim,
+                kernel_size=patch_size,
+                stride=patch_size,
+                bias=True
+            )
+        elif input_type == '1d':
+            self.proj = nn.Conv1d(
+                in_chans,
+                embed_dim,
+                kernel_size=patch_size,
+                stride=patch_size,
+                bias=True
+            )
+
+    def forward(self, x):
+        if self.input_type == '2d':
+            B, C, H, W = x.shape
+            assert H % self.patch_size == 0 and W % self.patch_size == 0
+        elif self.input_type == '1d':
+            B, C, H = x.shape
+            assert H % self.patch_size == 0
+
+        x = self.proj(x).flatten(2).transpose(1, 2)
+        return x
+
+
+class PositionalConvEmbedding(nn.Module):
+    """
+    Relative positional embedding used in HuBERT
+    """
+    def __init__(self, dim=768, kernel_size=128, groups=16):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            dim,
+            dim,
+            kernel_size=kernel_size,
+            padding=kernel_size // 2,
+            groups=groups,
+            bias=True
+        )
+        self.conv = nn.utils.parametrizations.weight_norm(
+            self.conv, name="weight", dim=2
+        )
+
+    def forward(self, x):
+        # B C T
+        x = self.conv(x)
+        x = F.gelu(x[:, :, :-1])
+        return x
+
+
+class SinusoidalPositionalEncoding(nn.Module):
+    def __init__(self, dim, length):
+        super(SinusoidalPositionalEncoding, self).__init__()
+        self.length = length
+        self.dim = dim
+        self.register_buffer(
+            'pe', self._generate_positional_encoding(length, dim)
+        )
+
+    def _generate_positional_encoding(self, length, dim):
+        pe = torch.zeros(length, dim)
+        position = torch.arange(0, length, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, dim, 2).float() * (-math.log(10000.0) / dim)
+        )
+
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+
+        pe = pe.unsqueeze(0)
+        return pe
+
+    def forward(self, x):
+        x = x + self.pe[:, :x.size(1)]
+        return x
+
+
+class PE_wrapper(nn.Module):
+    def __init__(self, dim=768, method='abs', length=None, **kwargs):
+        super().__init__()
+        self.method = method
+        if method == 'abs':
+            # init absolute pe like UViT
+            self.length = length
+            self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
+            trunc_normal_(self.abs_pe, std=.02)
+        elif method == 'conv':
+            self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
+        elif method == 'sinu':
+            self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
+        elif method == 'none':
+            # skip pe
+            self.id = nn.Identity()
+        else:
+            raise NotImplementedError
+
+    def forward(self, x):
+        if self.method == 'abs':
+            _, L, _ = x.shape
+            assert L <= self.length
+            x = x + self.abs_pe[:, :L, :]
+        elif self.method == 'conv':
+            x = x + self.conv_pe(x)
+        elif self.method == 'sinu':
+            x = self.sinu_pe(x)
+        elif self.method == 'none':
+            x = self.id(x)
+        else:
+            raise NotImplementedError
+        return x
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        """
+        Initialize the RMSNorm normalization layer.
+
+        Args:
+            dim (int): The dimension of the input tensor.
+            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
+
+        Attributes:
+            eps (float): A small value added to the denominator for numerical stability.
+            weight (nn.Parameter): Learnable scaling parameter.
+
+        """
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        """
+        Apply the RMSNorm normalization to the input tensor.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The normalized tensor.
+
+        """
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        """
+        Forward pass through the RMSNorm layer.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+
+        Returns:
+            torch.Tensor: The output tensor after applying RMSNorm.
+
+        """
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+
+class GELU(nn.Module):
+    def __init__(
+        self,
+        dim_in: int,
+        dim_out: int,
+        approximate: str = "none",
+        bias: bool = True
+    ):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.approximate = approximate
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(
+            gate.to(dtype=torch.float32), approximate=self.approximate
+        ).to(dtype=gate.dtype)
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+
+
+class GEGLU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+
+    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states, gate = hidden_states.chunk(2, dim=-1)
+        return hidden_states * self.gelu(gate)
+
+
+class ApproximateGELU(nn.Module):
+    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
+
+
+# disable in checkpoint mode
+# @torch.jit.script
+def snake_beta(x, alpha, beta):
+    return x + beta * torch.sin(x * alpha).pow(2)
+
+
+class Snake(nn.Module):
+    def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+    def forward(self, x):
+        x = self.proj(x)
+        x = snake_beta(x, self.alpha, self.beta)
+        return x
+
+
+class GESnake(nn.Module):
+    def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
+        self.alpha = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.beta = nn.Parameter(torch.ones(1, 1, dim_out))
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+    def forward(self, x):
+        x = self.proj(x)
+        x, gate = x.chunk(2, dim=-1)
+        return x * snake_beta(gate, self.alpha, self.beta)
+
+
+class FeedForward(nn.Module):
+    def __init__(
+        self,
+        dim,
+        dim_out=None,
+        mult=4,
+        dropout=0.0,
+        activation_fn="geglu",
+        final_dropout=False,
+        inner_dim=None,
+        bias=True,
+    ):
+        super().__init__()
+        if inner_dim is None:
+            inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim, bias=bias)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+        elif activation_fn == "snake":
+            act_fn = Snake(dim, inner_dim, bias=bias)
+        elif activation_fn == "gesnake":
+            act_fn = GESnake(dim, inner_dim, bias=bias)
+        else:
+            raise NotImplementedError
+
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(nn.Linear(inner_dim, dim_out, bias=bias))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
diff --git a/models/dit/rotary.py b/models/dit/rotary.py
new file mode 100644
index 0000000000000000000000000000000000000000..f539185c22e715d5a7ac66772ffa9f15a1e5df35
--- /dev/null
+++ b/models/dit/rotary.py
@@ -0,0 +1,88 @@
+import torch
+"this rope is faster than llama rope with jit script"
+
+
+def rotate_half(x):
+    x1, x2 = x.chunk(2, dim=-1)
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# disable in checkpoint mode
+# @torch.jit.script
+def apply_rotary_pos_emb(x, cos, sin):
+    # NOTE: This could probably be moved to Triton
+    # Handle a possible sequence length mismatch in between q and k
+    cos = cos[:, :, :x.shape[-2], :]
+    sin = sin[:, :, :x.shape[-2], :]
+    return (x*cos) + (rotate_half(x) * sin)
+
+
+class RotaryEmbedding(torch.nn.Module):
+    """
+    The rotary position embeddings from RoFormer_ (Su et. al).
+    A crucial insight from the method is that the query and keys are
+    transformed by rotation matrices which depend on the relative positions.
+
+    Other implementations are available in the Rotary Transformer repo_ and in
+    GPT-NeoX_, GPT-NeoX was an inspiration
+
+    .. _RoFormer: https://arxiv.org/abs/2104.09864
+    .. _repo: https://github.com/ZhuiyiTechnology/roformer
+    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
+
+
+    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
+        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
+    """
+    def __init__(self, dim: int):
+        super().__init__()
+        # Generate and save the inverse frequency buffer (non trainable)
+        inv_freq = 1.0 / (10000**(torch.arange(0, dim, 2).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self._seq_len_cached = None
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_tables(self, x, seq_dimension=-2):
+        # expect input: B, H, L, D
+        seq_len = x.shape[seq_dimension]
+
+        # Reset the tables if the sequence length has changed,
+        # or if we're on a new device (possibly due to tracing for instance)
+        # also make sure dtype wont change
+        if (
+            seq_len != self._seq_len_cached or
+            self._cos_cached.device != x.device or
+            self._cos_cached.dtype != x.dtype
+        ):
+            self._seq_len_cached = seq_len
+            t = torch.arange(
+                x.shape[seq_dimension], device=x.device, dtype=torch.float32
+            )
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+
+            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
+            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)
+
+        return self._cos_cached, self._sin_cached
+
+    def forward(self, q, k):
+        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
+            q.float(), seq_dimension=-2
+        )
+        if k is not None:
+            return (
+                apply_rotary_pos_emb(
+                    q.float(), self._cos_cached, self._sin_cached
+                ).type_as(q),
+                apply_rotary_pos_emb(
+                    k.float(), self._cos_cached, self._sin_cached
+                ).type_as(k),
+            )
+        else:
+            return (
+                apply_rotary_pos_emb(
+                    q.float(), self._cos_cached, self._sin_cached
+                ).type_as(q), None
+            )
diff --git a/models/dit/span_mask.py b/models/dit/span_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0832567c3e4dcc0c49fdd88dadff11c80d8e2a0
--- /dev/null
+++ b/models/dit/span_mask.py
@@ -0,0 +1,149 @@
+import numpy as np
+import torch
+from typing import Optional, Tuple
+
+
+def compute_mask_indices(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape
+
+    Args:
+        shape: the the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from possion distribution with lambda = mask length
+        min_masks: minimum number of masked spans
+        no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+    """
+
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+
+    # Convert mask_prob to a NumPy array
+    mask_prob = np.array(mask_prob)
+
+    # Calculate all_num_mask for each element in the batch
+    all_num_mask = np.floor(
+        mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)
+    ).astype(int)
+
+    # Apply the max operation with min_masks for each element
+    all_num_mask = np.maximum(min_masks, all_num_mask)
+
+    mask_idcs = []
+    for i in range(bsz):
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length) + np.random.rand()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            sz = all_sz
+            num_mask = all_num_mask[i]
+
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            lengths = np.random.randint(
+                mask_other, mask_length*2 + 1, size=num_mask
+            )
+        elif mask_type == "normal":
+            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = np.random.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+
+        if sum(lengths) == 0:
+            lengths[0] = min(mask_length, sz - 1)
+
+        if no_overlap:
+            mask_idc = []
+
+            def arrange(s, e, length, keep_length):
+                span_start = np.random.randint(s, e - length)
+                mask_idc.extend(span_start + i for i in range(length))
+
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - keep_length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            for length in sorted(lengths, reverse=True):
+                lens = np.fromiter(
+                    (
+                        e - s if e - s >= length + min_space else 0
+                        for s, e in parts
+                    ),
+                    np.int,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / np.sum(lens)
+                c = np.random.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            mask_idc = np.asarray(mask_idc)
+        else:
+            min_len = min(lengths)
+            if sz - min_len <= num_mask:
+                min_len = sz - num_mask - 1
+
+            mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+
+            mask_idc = np.asarray([
+                mask_idc[j] + offset for j in range(len(mask_idc))
+                for offset in range(lengths[j])
+            ])
+
+        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+    # min_len = min([len(m) for m in mask_idcs])
+    for i, mask_idc in enumerate(mask_idcs):
+        # if len(mask_idc) > min_len:
+        # mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+        mask[i, mask_idc] = True
+
+    return torch.tensor(mask)
+
+
+if __name__ == '__main__':
+    mask = compute_mask_indices(
+        shape=[4, 500],
+        padding_mask=None,
+        mask_prob=[0.65, 0.5, 0.65, 0.65],
+        mask_length=10,
+        mask_type="static",
+        mask_other=0.0,
+        min_masks=1,
+        no_overlap=False,
+        min_space=0,
+    )
+    print(mask)
+    print(mask.sum(dim=1))
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000000000000000000000000000000000000..36737bc8bdaec34b084ff5886a8f37600f06210f
--- /dev/null
+++ b/requirement.txt
@@ -0,0 +1,11 @@
+gradio==5.44.1
+torch==2.3.0
+torchaudio==2.3.0
+librosa
+soundfile==0.13.1
+numpy==1.26.4
+requests==2.28.1
+tqdm==4.67.1
+einops==0.8.1
+diffusers==0.35.1
+alias_free_torch==0.0.6
\ No newline at end of file
diff --git a/stabilityai-stable-diffusion-2-1/scheduler/scheduler_config.json b/stabilityai-stable-diffusion-2-1/scheduler/scheduler_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..536b82b4e3c62c4898b4ac8725bc514f2a98f5de
--- /dev/null
+++ b/stabilityai-stable-diffusion-2-1/scheduler/scheduler_config.json
@@ -0,0 +1,14 @@
+{
+  "_class_name": "DDIMScheduler",
+  "_diffusers_version": "0.8.0",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "prediction_type": "v_prediction",
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "trained_betas": null
+}
diff --git a/utils/__pycache__/accelerate_utilities.cpython-310.pyc b/utils/__pycache__/accelerate_utilities.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c605a3475f9e51b5ee0334105b5f07e68f75b1d
Binary files /dev/null and b/utils/__pycache__/accelerate_utilities.cpython-310.pyc differ
diff --git a/utils/__pycache__/audiotime_event.cpython-310.pyc b/utils/__pycache__/audiotime_event.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8261001c40ce7f6e30293add221ba7c8e7eecc98
Binary files /dev/null and b/utils/__pycache__/audiotime_event.cpython-310.pyc differ
diff --git a/utils/__pycache__/audiotime_event_merge.cpython-310.pyc b/utils/__pycache__/audiotime_event_merge.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9eddba4297a5a8cb352202a3c7ac8d887829cdca
Binary files /dev/null and b/utils/__pycache__/audiotime_event_merge.cpython-310.pyc differ
diff --git a/utils/__pycache__/audiotime_event_new.cpython-310.pyc b/utils/__pycache__/audiotime_event_new.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb72e4b63fa814972007eadca37b904d3b718d8
Binary files /dev/null and b/utils/__pycache__/audiotime_event_new.cpython-310.pyc differ
diff --git a/utils/__pycache__/config.cpython-310.pyc b/utils/__pycache__/config.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35960a066c1728f47be4dc5bb34aa9542d57f0ad
Binary files /dev/null and b/utils/__pycache__/config.cpython-310.pyc differ
diff --git a/utils/__pycache__/diffsinger_utilities.cpython-310.pyc b/utils/__pycache__/diffsinger_utilities.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32fe663324cb50dc21e242028664d458819460ca
Binary files /dev/null and b/utils/__pycache__/diffsinger_utilities.cpython-310.pyc differ
diff --git a/utils/__pycache__/filter_data.cpython-310.pyc b/utils/__pycache__/filter_data.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd22bc6b77d1cee0b356317ac2241bb0f5073c16
Binary files /dev/null and b/utils/__pycache__/filter_data.cpython-310.pyc differ
diff --git a/utils/__pycache__/llm.cpython-310.pyc b/utils/__pycache__/llm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59a1862f7bfcae5bc77fe8fc6d864f07ac02f08a
Binary files /dev/null and b/utils/__pycache__/llm.cpython-310.pyc differ
diff --git a/utils/__pycache__/llm_xiapi.cpython-310.pyc b/utils/__pycache__/llm_xiapi.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12619d242e4d5721459855251807fbc6d835de70
Binary files /dev/null and b/utils/__pycache__/llm_xiapi.cpython-310.pyc differ
diff --git a/utils/__pycache__/log_helper.cpython-310.pyc b/utils/__pycache__/log_helper.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..653ff16ba4b0ed67826f83e510c5a41204e649a7
Binary files /dev/null and b/utils/__pycache__/log_helper.cpython-310.pyc differ
diff --git a/utils/__pycache__/logging.cpython-310.pyc b/utils/__pycache__/logging.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bff87f124956fe7681bb5eb52dffdbea99c833f2
Binary files /dev/null and b/utils/__pycache__/logging.cpython-310.pyc differ
diff --git a/utils/__pycache__/logging.cpython-313.pyc b/utils/__pycache__/logging.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..660c158c3b26f04a0c768c859a19be65c8f4fda2
Binary files /dev/null and b/utils/__pycache__/logging.cpython-313.pyc differ
diff --git a/utils/__pycache__/lr_scheduler_utilities.cpython-310.pyc b/utils/__pycache__/lr_scheduler_utilities.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23d865c2549b16fded100e464af53bc849726bfb
Binary files /dev/null and b/utils/__pycache__/lr_scheduler_utilities.cpython-310.pyc differ
diff --git a/utils/__pycache__/torch_utilities.cpython-310.pyc b/utils/__pycache__/torch_utilities.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56c9a405867cd1d72c76d84b137fac30a2e1f6a8
Binary files /dev/null and b/utils/__pycache__/torch_utilities.cpython-310.pyc differ
diff --git a/utils/accelerate_utilities.py b/utils/accelerate_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..99fc5aa3ad2700361e006799c6aac8119c9bb15a
--- /dev/null
+++ b/utils/accelerate_utilities.py
@@ -0,0 +1,13 @@
+from accelerate import Accelerator
+
+
+class AcceleratorSaveTrainableParams(Accelerator):
+    def get_state_dict(self, model, unwrap=True):
+        state_dict = super().get_state_dict(model, unwrap)
+        if hasattr(model, "param_names_to_save"):
+            param_names_to_save = model.param_names_to_save
+            return {
+                k: v
+                for k, v in state_dict.items() if k in param_names_to_save
+            }
+        return state_dict
diff --git a/utils/audio.py b/utils/audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..350a9fe08e229a2f979f8090e314216c6b356739
--- /dev/null
+++ b/utils/audio.py
@@ -0,0 +1,58 @@
+import torch
+import torch.nn as nn
+import torchaudio
+
+
+class PadCrop(nn.Module):
+    def __init__(self, n_samples, randomize=True):
+        super().__init__()
+        self.n_samples = n_samples
+        self.randomize = randomize
+
+    def __call__(self, signal):
+        n, s = signal.shape
+        start = 0 if (
+            not self.randomize
+        ) else torch.randint(0,
+                             max(0, s - self.n_samples) + 1, []).item()
+        end = start + self.n_samples
+        output = signal.new_zeros([n, self.n_samples])
+        output[:, :min(s, self.n_samples)] = signal[:, start:end]
+        return output
+
+
+def set_audio_channels(audio, target_channels):
+    if target_channels == 1:
+        # Convert to mono
+        audio = audio.mean(1, keepdim=True)
+    elif target_channels == 2:
+        # Convert to stereo
+        if audio.shape[1] == 1:
+            audio = audio.repeat(1, 2, 1)
+        elif audio.shape[1] > 2:
+            audio = audio[:, :2, :]
+    return audio
+
+
+def prepare_audio(
+    audio, in_sr, target_sr, target_length, target_channels, device
+):
+
+    audio = audio.to(device)
+
+    if in_sr != target_sr:
+        resample_tf = torchaudio.transforms.Resample(in_sr,
+                                                     target_sr).to(device)
+        audio = resample_tf(audio)
+
+    audio = PadCrop(target_length, randomize=False)(audio)
+
+    # Add batch dimension
+    if audio.dim() == 1:
+        audio = audio.unsqueeze(0).unsqueeze(0)
+    elif audio.dim() == 2:
+        audio = audio.unsqueeze(0)
+
+    audio = set_audio_channels(audio, target_channels)
+
+    return audio
diff --git a/utils/audiotime_event_merge.py b/utils/audiotime_event_merge.py
new file mode 100644
index 0000000000000000000000000000000000000000..74c58afffe49e40248f32fdec03c55651470fb3f
--- /dev/null
+++ b/utils/audiotime_event_merge.py
@@ -0,0 +1,99 @@
+import json
+
+def get_event_synonyms():
+    file_path = "./utils/merge_content.json"
+    with open(file_path, "r", encoding="utf-8") as file:
+        data = json.load(file)
+    
+    result = {}
+    for item in data:
+        event = item.get("event")
+        phrases = item.get("phrases", [])
+        result[event] = phrases
+    
+    return result
+
+
+import random
+import re
+
+def replace_event_synonyms(caption, onset):
+    """
+    Replace event names in both caption(TCC) and onset(TDC) string with corresponding free text descriptions.
+
+    Args:
+        caption (str): Caption text containing event names.
+        onset (str): Onset string, formatted as "event__start-end--event2__start-end".
+
+    Returns:
+        new_caption (str): Caption with event names replaced by descriptions.
+        new_onset (str): Onset string with event names replaced by descriptions.
+
+    Notes:
+        - Synonyms are fetched using get_event_synonyms().
+        - For each event, a random synonym is chosen.
+        - All occurrences in caption (with correct pluralization) and onset are replaced.
+    """
+    event_pattern = r"([a-zA-Z_()\s]+?)__((?:[\d\.\-]+_?)+)(?=--|$)"
+    events = re.findall(event_pattern, onset)
+    synonyms_dict = get_event_synonyms()
+    replacements = {}
+    # Choose a random synonym for each unique event
+    for event_name, _ in events:
+        if event_name not in replacements:
+            candidates = synonyms_dict.get(event_name, [event_name])
+            replacements[event_name] = random.choice(candidates) 
+    # Replace event names in the onset string
+    new_onset = "--".join([
+        f"{replacements[event]}__{timestamps}"
+        for event, timestamps in events
+    ])
+    # Replace event names in the caption, handling plural forms and case
+    new_caption = caption
+    for orig, repl in replacements.items():
+        orig_space = orig.replace("_", " ")
+        repl_space = repl.replace("_", " ")
+
+        escaped_orig_space = re.escape(orig_space)
+
+        pattern = rf"(?<!\w){escaped_orig_space}(es|s)?(?!\w)"
+
+        new_caption = re.sub(
+            pattern,
+            lambda m: match_plural(m, repl_space),
+            new_caption,
+            flags=re.IGNORECASE
+        )
+    
+    return new_caption.capitalize(), new_onset
+
+def match_plural(match_obj, replacement):
+    """
+    Return replacement word with same plural suffix as the matched word.
+
+    Args:
+        match_obj (re.Match): Match object with possible plural suffix.
+        replacement (str): Replacement string for the event name.
+
+    Returns:
+        str: Replacement string with plural suffix preserved.
+    """
+
+    matched = match_obj.group(0)
+    suffix = match_obj.group(1) or ""  # Get plural suffix if present
+    base_replacement = replacement
+
+    # Preserve plural suffix ("s" or "es") from original word
+    return base_replacement + suffix
+
+if __name__ == "__main__":
+    onset = "wind_chime__0.78-2.78"
+    caption = "wind chime one times"
+
+    print("Original onset:", onset)
+    print("Original caption:", caption)
+    
+    caption, onset  = replace_event_synonyms(caption, onset)
+
+    print("Modified onset:", onset)
+    print("Modified caption:", caption)
\ No newline at end of file
diff --git a/utils/config.py b/utils/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8b44ea40a5654d3f53b2ed1ace302ff0a18f72c
--- /dev/null
+++ b/utils/config.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+import sys
+from typing import Union
+import os
+
+import hydra
+import omegaconf
+from omegaconf import OmegaConf
+
+
+def multiply(*args):
+    result = 1
+    for arg in args:
+        result *= arg
+    return result
+
+
+def get_pitch_downsample_ratio(
+    autoencoder_config: dict, pitch_frame_resolution: float
+):
+    latent_frame_resolution = autoencoder_config[
+        "downsampling_ratio"] / autoencoder_config["sample_rate"]
+    return round(latent_frame_resolution / pitch_frame_resolution)
+
+
+def register_omegaconf_resolvers() -> None:
+    """
+    Register custom resolver for hydra configs, which can be used in YAML
+    files for dynamically setting values
+    """
+    omegaconf.OmegaConf.clear_resolvers()
+    omegaconf.OmegaConf.register_new_resolver("len", len, replace=True)
+    omegaconf.OmegaConf.register_new_resolver(
+        "multiply", multiply, replace=True
+    )
+    omegaconf.OmegaConf.register_new_resolver(
+        "get_pitch_downsample_ratio", get_pitch_downsample_ratio, replace=True
+    )
+
+
+def generate_config_from_command_line_overrides(
+    config_file: Union[str, Path]
+) -> omegaconf.DictConfig:
+    register_omegaconf_resolvers()
+
+    config_file = Path(config_file).resolve()
+    config_name = config_file.name.__str__()
+    config_path = config_file.parent.__str__()
+    config_path = os.path.relpath(config_path, Path(__file__).resolve().parent)
+
+    overrides = sys.argv[1:]
+    with hydra.initialize(version_base=None, config_path=config_path):
+        config = hydra.compose(config_name=config_name, overrides=overrides)
+    omegaconf.OmegaConf.resolve(config)
+
+    return config
diff --git a/utils/diffsinger_utilities.py b/utils/diffsinger_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d7d55426080aa495fe9212390599ff348ef4c69
--- /dev/null
+++ b/utils/diffsinger_utilities.py
@@ -0,0 +1,550 @@
+import six
+from pathlib import Path
+import re
+import json
+from collections import OrderedDict
+from typing import Union
+
+import numpy as np
+import librosa
+import torch
+
+# Special vocabulary symbols.
+PAD = "<pad>"
+EOS = "<EOS>"
+UNK = "<UNK>"
+SEG = "|"
+# Reserved tokens in id order; SEG is not part of the reserved set.
+RESERVED_TOKENS = [PAD, EOS, UNK]
+NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)
+# Positions of the reserved tokens inside RESERVED_TOKENS.
+PAD_ID = RESERVED_TOKENS.index(PAD)  # Normally 0
+EOS_ID = RESERVED_TOKENS.index(EOS)  # Normally 1
+UNK_ID = RESERVED_TOKENS.index(UNK)  # Normally 2
+
+F0_BIN = 256
+F0_MAX = 1100.0
+F0_MIN = 50.0
+F0_MEL_MIN = 1127 * np.log(1 + F0_MIN/700)
+F0_MEL_MAX = 1127 * np.log(1 + F0_MAX/700)
+
+
+def f0_to_coarse(f0):
+    is_torch = isinstance(f0, torch.Tensor)
+    f0_mel = 1127 * (1 +
+                     f0/700).log() if is_torch else 1127 * np.log(1 + f0/700)
+    f0_mel[f0_mel > 0
+          ] = (f0_mel[f0_mel > 0] -
+               F0_MEL_MIN) * (F0_BIN-2) / (F0_MEL_MAX-F0_MEL_MIN) + 1
+
+    f0_mel[f0_mel <= 1] = 1
+    f0_mel[f0_mel > F0_BIN - 1] = F0_BIN - 1
+    f0_coarse = (f0_mel +
+                 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int)
+    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
+        f0_coarse.max(), f0_coarse.min()
+    )
+    return f0_coarse
+
+
+def norm_f0(
+    f0: Union[np.ndarray, torch.Tensor],
+    uv: Union[None, np.ndarray],
+    f0_mean: float,
+    f0_std: float,
+    pitch_norm: str = "log",
+    use_uv: bool = True
+):
+    is_torch = isinstance(f0, torch.Tensor)
+    if pitch_norm == 'standard':
+        f0 = (f0-f0_mean) / f0_std
+    if pitch_norm == 'log':
+        f0 = torch.log2(f0) if is_torch else np.log2(f0)
+    if uv is not None and use_uv:
+        f0[uv > 0] = 0
+    return f0
+
+
+def norm_interp_f0(
+    f0: Union[np.ndarray, torch.Tensor],
+    f0_mean: float,
+    f0_std: float,
+    pitch_norm: str = "log",
+    use_uv: bool = True
+):
+    is_torch = isinstance(f0, torch.Tensor)
+    if is_torch:
+        device = f0.device
+        f0 = f0.data.cpu().numpy()
+    uv = f0 == 0
+    f0 = norm_f0(f0, uv, f0_mean, f0_std, pitch_norm, use_uv)
+    if sum(uv) == len(f0):
+        f0[uv] = 0
+    elif sum(uv) > 0:
+        f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
+    uv = torch.as_tensor(uv).float()
+    f0 = torch.as_tensor(f0).float()
+    if is_torch:
+        f0 = f0.to(device)
+    return f0, uv
+
+
+def denorm_f0(
+    f0,
+    uv,
+    pitch_norm="log",
+    f0_mean=None,
+    f0_std=None,
+    pitch_padding=None,
+    min=None,
+    max=None,
+    use_uv=True
+):
+    if pitch_norm == 'standard':
+        f0 = f0*f0_std + f0_mean
+    if pitch_norm == 'log':
+        f0 = 2**f0
+    if min is not None:
+        f0 = f0.clamp(min=min)
+    if max is not None:
+        f0 = f0.clamp(max=max)
+    if uv is not None and use_uv:
+        f0[uv > 0] = 0
+    if pitch_padding is not None:
+        f0[pitch_padding] = 0
+    return f0
+
+
+def librosa_pad_lr(x, fshift, pad_sides=1):
+    '''compute right padding (final frame) or both sides padding (first and final frames)
+    '''
+    assert pad_sides in (1, 2)
+    # return int(fsize // 2)
+    pad = (x.shape[0] // fshift + 1) * fshift - x.shape[0]
+    if pad_sides == 1:
+        return 0, pad
+    else:
+        return pad // 2, pad//2 + pad%2
+
+
+def get_pitch(
+    wav_file: Union[str, Path], sample_rate: int, frame_shift: float
+):
+    import parselmouth
+    hop_size = int(frame_shift * sample_rate)
+    wav, _ = librosa.core.load(wav_file, sr=sample_rate)
+    # l_pad, r_pad = librosa_pad_lr(wav, hop_size, 1)
+    # wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
+
+    latent_length = wav.shape[0] // hop_size
+    f0_min = 80
+    f0_max = 750
+    pad_size = 4
+
+    f0 = parselmouth.Sound(wav, sample_rate).to_pitch_ac(
+        time_step=frame_shift,
+        voicing_threshold=0.6,
+        pitch_floor=f0_min,
+        pitch_ceiling=f0_max
+    ).selected_array['frequency']
+    delta_l = latent_length - len(f0)
+    if delta_l > 0:
+        f0 = np.concatenate([f0, [f0[-1]] * delta_l], 0)
+    pitch_coarse = f0_to_coarse(f0)
+    return f0, pitch_coarse
+
+
+def remove_empty_lines(text):
+    """remove empty lines"""
+    assert (len(text) > 0)
+    assert (isinstance(text, list))
+    text = [t.strip() for t in text]
+    if "" in text:
+        text.remove("")
+    return text
+
+
+def is_sil_phoneme(p):
+    return not p[0].isalpha()
+
+
+def strip_ids(ids, ids_to_strip):
+    """Strip ids_to_strip from the end ids."""
+    ids = list(ids)
+    while ids and ids[-1] in ids_to_strip:
+        ids.pop()
+    return ids
+
+
+class TextEncoder(object):
+    """Base class for converting from ints to/from human readable strings."""
+    def __init__(self, num_reserved_ids=NUM_RESERVED_TOKENS):
+        self._num_reserved_ids = num_reserved_ids
+
+    @property
+    def num_reserved_ids(self):
+        return self._num_reserved_ids
+
+    def encode(self, s):
+        """Transform a human-readable string into a sequence of int ids.
+
+        The ids should be in the range [num_reserved_ids, vocab_size). Ids [0,
+        num_reserved_ids) are reserved.
+
+        EOS is not appended.
+
+        Args:
+        s: human-readable string to be converted.
+
+        Returns:
+        ids: list of integers
+        """
+        return [int(w) + self._num_reserved_ids for w in s.split()]
+
+    def decode(self, ids, strip_extraneous=False):
+        """Transform a sequence of int ids into a human-readable string.
+
+        EOS is not expected in ids.
+
+        Args:
+        ids: list of integers to be converted.
+        strip_extraneous: bool, whether to strip off extraneous tokens
+            (EOS and PAD).
+
+        Returns:
+        s: human-readable string.
+        """
+        if strip_extraneous:
+            ids = strip_ids(ids, list(range(self._num_reserved_ids or 0)))
+        return " ".join(self.decode_list(ids))
+
+    def decode_list(self, ids):
+        """Transform a sequence of int ids into a their string versions.
+
+        This method supports transforming individual input/output ids to their
+        string versions so that sequence to/from text conversions can be visualized
+        in a human readable format.
+
+        Args:
+        ids: list of integers to be converted.
+
+        Returns:
+        strs: list of human-readable string.
+        """
+        decoded_ids = []
+        for id_ in ids:
+            if 0 <= id_ < self._num_reserved_ids:
+                decoded_ids.append(RESERVED_TOKENS[int(id_)])
+            else:
+                decoded_ids.append(id_ - self._num_reserved_ids)
+        return [str(d) for d in decoded_ids]
+
+    @property
+    def vocab_size(self):
+        raise NotImplementedError()
+
+
+class TokenTextEncoder(TextEncoder):
+    """Encoder based on a user-supplied vocabulary (file or list)."""
+    def __init__(
+        self,
+        vocab_filename,
+        reverse=False,
+        vocab_list=None,
+        replace_oov=None,
+        num_reserved_ids=NUM_RESERVED_TOKENS
+    ):
+        """Initialize from a file or list, one token per line.
+
+        Handling of reserved tokens works as follows:
+        - When initializing from a list, we add reserved tokens to the vocab.
+        - When initializing from a file, we do not add reserved tokens to the vocab.
+        - When saving vocab files, we save reserved tokens to the file.
+
+        Args:
+            vocab_filename: If not None, the full filename to read vocab from. If this
+                is not None, then vocab_list should be None.
+            reverse: Boolean indicating if tokens should be reversed during encoding
+                and decoding.
+            vocab_list: If not None, a list of elements of the vocabulary. If this is
+                not None, then vocab_filename should be None.
+            replace_oov: If not None, every out-of-vocabulary token seen when
+                encoding will be replaced by this string (which must be in vocab).
+            num_reserved_ids: Number of IDs to save for reserved tokens like <EOS>.
+        """
+        super(TokenTextEncoder,
+              self).__init__(num_reserved_ids=num_reserved_ids)
+        self._reverse = reverse
+        self._replace_oov = replace_oov
+        if vocab_filename:
+            self._init_vocab_from_file(vocab_filename)
+        else:
+            assert vocab_list is not None
+            self._init_vocab_from_list(vocab_list)
+        self.pad_index = self._token_to_id[PAD]
+        self.eos_index = self._token_to_id[EOS]
+        self.unk_index = self._token_to_id[UNK]
+        self.seg_index = self._token_to_id[
+            SEG] if SEG in self._token_to_id else self.eos_index
+
+    def encode(self, s):
+        """Converts a space-separated string of tokens to a list of ids."""
+        sentence = s
+        tokens = sentence.strip().split()
+        if self._replace_oov is not None:
+            tokens = [
+                t if t in self._token_to_id else self._replace_oov
+                for t in tokens
+            ]
+        ret = [self._token_to_id[tok] for tok in tokens]
+        return ret[::-1] if self._reverse else ret
+
+    def decode(self, ids, strip_eos=False, strip_padding=False):
+        if strip_padding and self.pad() in list(ids):
+            pad_pos = list(ids).index(self.pad())
+            ids = ids[:pad_pos]
+        if strip_eos and self.eos() in list(ids):
+            eos_pos = list(ids).index(self.eos())
+            ids = ids[:eos_pos]
+        return " ".join(self.decode_list(ids))
+
+    def decode_list(self, ids):
+        seq = reversed(ids) if self._reverse else ids
+        return [self._safe_id_to_token(i) for i in seq]
+
+    @property
+    def vocab_size(self):
+        return len(self._id_to_token)
+
+    def __len__(self):
+        return self.vocab_size
+
+    def _safe_id_to_token(self, idx):
+        return self._id_to_token.get(idx, "ID_%d" % idx)
+
+    def _init_vocab_from_file(self, filename):
+        """Load vocab from a file.
+
+        Args:
+        filename: The file to load vocabulary from.
+        """
+        with open(filename) as f:
+            tokens = [token.strip() for token in f.readlines()]
+
+        def token_gen():
+            for token in tokens:
+                yield token
+
+        self._init_vocab(token_gen(), add_reserved_tokens=False)
+
+    def _init_vocab_from_list(self, vocab_list):
+        """Initialize tokens from a list of tokens.
+
+        It is ok if reserved tokens appear in the vocab list. They will be
+        removed. The set of tokens in vocab_list should be unique.
+
+        Args:
+        vocab_list: A list of tokens.
+        """
+        def token_gen():
+            for token in vocab_list:
+                if token not in RESERVED_TOKENS:
+                    yield token
+
+        self._init_vocab(token_gen())
+
+    def _init_vocab(self, token_generator, add_reserved_tokens=True):
+        """Initialize vocabulary with tokens from token_generator."""
+
+        self._id_to_token = {}
+        non_reserved_start_index = 0
+
+        if add_reserved_tokens:
+            self._id_to_token.update(enumerate(RESERVED_TOKENS))
+            non_reserved_start_index = len(RESERVED_TOKENS)
+
+        self._id_to_token.update(
+            enumerate(token_generator, start=non_reserved_start_index)
+        )
+
+        # _token_to_id is the reverse of _id_to_token
+        self._token_to_id = dict((v, k)
+                                 for k, v in six.iteritems(self._id_to_token))
+
+    def pad(self):
+        return self.pad_index
+
+    def eos(self):
+        return self.eos_index
+
+    def unk(self):
+        return self.unk_index
+
+    def seg(self):
+        return self.seg_index
+
+    def store_to_file(self, filename):
+        """Write vocab file to disk.
+
+        Vocab files have one token per line. The file ends in a newline. Reserved
+        tokens are written to the vocab file as well.
+
+        Args:
+        filename: Full path of the file to store the vocab to.
+        """
+        with open(filename, "w") as f:
+            for i in range(len(self._id_to_token)):
+                f.write(self._id_to_token[i] + "\n")
+
+    def sil_phonemes(self):
+        return [p for p in self._id_to_token.values() if not p[0].isalpha()]
+
+
+class TextGrid(object):
+    """Minimal line-oriented parser for Praat TextGrid text.
+
+    The constructor walks ``text`` with a line cursor (``line_count``) and
+    fills ``file_type``, ``xmin``, ``xmax``, ``size`` and ``tier_list``.
+    Only ``IntervalTier`` tiers are supported. Apart from the top-level
+    ``size`` (converted to ``int``), extracted fields are kept as the raw
+    matched strings.
+    """
+
+    def __init__(self, text):
+        """Parse ``text`` after passing it through ``remove_empty_lines``.
+
+        The result is indexed line by line and matched with ``re.match``, so
+        each element is expected to be one line that starts with the field
+        name (presumably ``remove_empty_lines`` strips indentation -- confirm).
+        """
+        text = remove_empty_lines(text)
+        self.text = text
+        self.line_count = 0
+        self._get_type()
+        self._get_time_intval()
+        self._get_size()
+        self.tier_list = []
+        self._get_item_list()
+
+    def _extract_pattern(self, pattern, inc):
+        """
+        Match ``pattern`` against the line under the cursor and advance.
+
+        Parameters
+        ----------
+        pattern : regex to extract pattern
+        inc : increment of line count after extraction
+        Returns
+        -------
+        group : extracted info (first capture group)
+        Raises
+        ------
+        ValueError : the input ends before the expected line, or the current
+            line does not match ``pattern``
+        """
+        # Report truncated input as a format error like any other mismatch
+        # instead of letting a bare IndexError escape.
+        if self.line_count >= len(self.text):
+            raise ValueError(
+                "File format error at line %d: unexpected end of file" %
+                self.line_count
+            )
+        line = self.text[self.line_count]
+        matched = re.match(pattern, line)
+        if matched is None:
+            raise ValueError(
+                "File format error at line %d:%s" % (self.line_count, line)
+            )
+        self.line_count += inc
+        return matched.group(1)
+
+    def _get_type(self):
+        # inc=2 also steps over the line that follows the file type.
+        self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2)
+
+    def _get_time_intval(self):
+        self.xmin = self._extract_pattern(r"xmin = (.*)", 1)
+        # inc=2 also steps over the line that follows xmax.
+        self.xmax = self._extract_pattern(r"xmax = (.*)", 2)
+
+    def _get_size(self):
+        # Number of tiers; inc=2 also steps over the line that follows it.
+        self.size = int(self._extract_pattern(r"size = (.*)", 2))
+
+    def _get_item_list(self):
+        """Only supports IntervalTier currently"""
+        for _ in range(self.size):
+            tier = OrderedDict()
+            item_list = []
+            tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1)
+            tier_class = self._extract_pattern(r"class = \"(.*)\"", 1)
+            if tier_class != "IntervalTier":
+                raise NotImplementedError(
+                    "Only IntervalTier class is supported currently"
+                )
+            tier_name = self._extract_pattern(r"name = \"(.*)\"", 1)
+            tier_xmin = self._extract_pattern(r"xmin = (.*)", 1)
+            tier_xmax = self._extract_pattern(r"xmax = (.*)", 1)
+            tier_size = self._extract_pattern(r"intervals: size = (.*)", 1)
+            for _ in range(int(tier_size)):
+                item = OrderedDict()
+                item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1)
+                item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1)
+                item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1)
+                item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1)
+                item_list.append(item)
+            tier["idx"] = tier_idx
+            tier["class"] = tier_class
+            tier["name"] = tier_name
+            tier["xmin"] = tier_xmin
+            tier["xmax"] = tier_xmax
+            # Stored as the matched string, not converted to int.
+            tier["size"] = tier_size
+            tier["items"] = item_list
+            self.tier_list.append(tier)
+
+    def toJson(self):
+        """Serialize the parsed fields to an indented JSON string."""
+        _json = OrderedDict()
+        _json["file_type"] = self.file_type
+        _json["xmin"] = self.xmin
+        _json["xmax"] = self.xmax
+        _json["size"] = self.size
+        _json["tiers"] = self.tier_list
+        return json.dumps(_json, ensure_ascii=False, indent=2)
+
+
+def read_duration_from_textgrid(
+    textgrid_path: Union[str, Path],
+    phoneme: str,
+    utterance_duration: float,
+):
+    """Derive per-phoneme durations from a TextGrid alignment.
+
+    The intervals of the last tier in the TextGrid are matched, in order,
+    against the space-separated phonemes in ``phoneme``. Silence-like
+    intervals ('sil', 'sp', '', 'SIL', 'PUNC') are blanked and consecutive
+    ones merged before matching.
+
+    Args:
+        textgrid_path: Path of the TextGrid file to read.
+        phoneme: Phoneme sequence separated by single spaces.
+        utterance_duration: Value used as the final boundary, i.e. the end of
+            the last phoneme (presumably in the same unit as the TextGrid
+            times -- confirm with the caller).
+
+    Returns:
+        ``np.ndarray`` with one float per phoneme: the differences between
+        consecutive phoneme start boundaries.
+
+    Raises:
+        AssertionError: If the alignment and the phoneme sequence disagree.
+        Parsing errors raised by ``TextGrid`` propagate unchanged.
+    """
+    ph_list = phoneme.split(" ")
+    with open(textgrid_path, "r") as f:
+        textgrid = f.readlines()
+    textgrid = remove_empty_lines(textgrid)
+    textgrid = TextGrid(textgrid)
+    textgrid = json.loads(textgrid.toJson())
+
+    # split[i] is the start time of phoneme i; -1 marks "not assigned yet".
+    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
+    split = np.ones(len(ph_list) + 1, float) * -1
+    tg_idx = 0
+    ph_idx = 0
+    tg_align = list(textgrid['tiers'][-1]['items'])
+    tg_align_ = []
+    for x in tg_align:
+        x['xmin'] = float(x['xmin'])
+        x['xmax'] = float(x['xmax'])
+        if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']:
+            x['text'] = ''
+            # Fold a run of silence intervals into the first one of the run.
+            if len(tg_align_) > 0 and tg_align_[-1]['text'] == '':
+                tg_align_[-1]['xmax'] = x['xmax']
+                continue
+        tg_align_.append(x)
+    tg_align = tg_align_
+    tg_len = len([x for x in tg_align if x['text'] != ''])
+    ph_len = len([x for x in ph_list if not is_sil_phoneme(x)])
+    assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, textgrid_path)
+    while tg_idx < len(tg_align) or ph_idx < len(ph_list):
+        if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]):
+            # Alignment exhausted but a silent phoneme remains.
+            # NOTE(review): the 1e8 written here is only overwritten for the
+            # first/last slot below; for any other slot it flows into np.diff
+            # -- confirm this is intended.
+            split[ph_idx] = 1e8
+            ph_idx += 1
+            continue
+        x = tg_align[tg_idx]
+        if x['text'] == '' and ph_idx == len(ph_list):
+            # Phonemes exhausted: skip the remaining silence interval.
+            tg_idx += 1
+            continue
+        assert ph_idx < len(ph_list), (
+            tg_len, ph_len, tg_align, ph_list, textgrid_path
+        )
+
+        ph = ph_list[ph_idx]
+        if x['text'] == '' and not is_sil_phoneme(ph):
+            assert False, (ph_list, tg_align)
+        if x['text'] != '' and is_sil_phoneme(ph):
+            # Silent phoneme without an interval of its own: leave its
+            # boundary at -1 for now and move on to the next phoneme.
+            ph_idx += 1
+        else:
+            assert (x['text'] == '' and is_sil_phoneme(ph)) \
+                   or x['text'].lower() == ph.lower() \
+                   or x['text'].lower() == 'sil', (x['text'], ph)
+            split[ph_idx] = x['xmin']
+            # Back-fill the directly preceding silent phoneme that was skipped
+            # above, giving it the same start (i.e. zero duration).
+            if ph_idx > 0 and split[ph_idx - 1] == -1 and is_sil_phoneme(
+                ph_list[ph_idx - 1]
+            ):
+                split[ph_idx - 1] = split[ph_idx]
+            ph_idx += 1
+            tg_idx += 1
+    assert tg_idx == len(tg_align), (tg_idx, [x['text'] for x in tg_align])
+    assert ph_idx >= len(ph_list) - 1, (
+        ph_idx, ph_list, len(ph_list), [x['text']
+                                        for x in tg_align], textgrid_path
+    )
+
+    # Pin the outer boundaries, then turn boundaries into durations.
+    split[0] = 0
+    split[-1] = utterance_duration
+    duration = np.diff(split)
+    return duration
diff --git a/utils/llm_xiapi.py b/utils/llm_xiapi.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e42df84c249b375df9db9d9ed84d4eadbd91bd7
--- /dev/null
+++ b/utils/llm_xiapi.py
@@ -0,0 +1,69 @@
+import json
+import os
+
+import requests
+
+url = "https://api.xi-ai.cn/v1/chat/completions"
+# The API key is read from the XI_API_KEY environment variable so that no
+# credential lives in the repository. When the variable is unset the header is
+# sent empty and the service is expected to reject the request.
+# NOTE: a key that was ever committed here must be treated as leaked and revoked.
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": os.environ.get("XI_API_KEY", ""),
+}
+
+# Instructions and few-shot examples placed before the caption in the prompt
+# built by get_time_info(): they describe the 'event__onset-offset' timing
+# format and list example records.
+# NOTE(review): the text announces the examples as 'index: input~output', but
+# the examples that follow are JSON objects -- confirm which is intended.
+training_info_pri = """
+I'm doing an audio event generation, which is a harmless job that will contain some sound events. For example, a gunshot is a sound that is harmless.
+You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', 
+where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. 
+The 'onset-offset' inside needs to be determined based on common sense, with a duration not less than 1.  All format 'onsetk-offsetk' should replaced by number. 
+You need to make a prediction for the total duration, which should not exceed 20 seconds and not exceed the latest end time of a single event.
+And pay attention to vocabulary that represents the order and frequency of events, such as 'after', 'followed by', 'n times', and so on.
+You can use the latest ending event of all events in the training dataset as the total audio time
+It is preferred that events do not overlap as much as possible.
+Now, I will provide you with some examples in training set for your learning, each example in the format 'index: input~output'. 
+{"onset": "squeal__1.359-2.373_3.216-4.23_5.576-6.59", "captions": "squeal 3 times", "length": "7.52"}
+{"onset": "sawing__1.432-3.975_4.533-6.54", "captions": "sawing 2 times", "length": "9.26"}
+{"onset": "slap__1.576-1.931_2.911-3.266--baby_laughter__5.179-6.394_7.362-8.577", "captions": "slap 2 times and baby laughter 2 times", "length": "9.59"}
+{"onset": "applause__1.538-5.128--scrape__7.03-8.004", "captions": "applause and scrape", "length": "9.13"}
+{"onset": "slam__0.68-1.01--walk__2.364-4.107--busy_signal__6.794-7.222_8.371-8.645", "captions": "slam 1 times and walk 1 times and busy signal 2 times", "length": "9.18"}
+{"onset": "slap__1.044-1.399--neigh__2.654-4.663_5.633-6.966", "captions": "slap 1 times followed by neigh 2 times", "length": "9.22"}
+{"onset": "bird_vocalization__1.253-2.184--yip__4.789-5.309_6.134-6.654", "captions": "bird vocalization 1 times and yip 2 times", "length": "9.83"}
+{"onset": "animal__1.478-3.541--crowing__5.464-7.11", "captions": "animal then crowing", "length": "9.45"}
+{"onset": "crying__0.999-7.773", "captions": "crying", "length": "9.48"}
+{"onset": "cricket__1.629-4.983", "captions": "cricket 1 times", "length": "5.87"}
+{"onset": "fireworks__1.336-2.477--car__4.193-6.649", "captions": "car after fireworks", "length": "9.7"}
+"""
+# Closing instructions appended after the caption; they ask for the answer as
+# an object with "onset", "captions" and "length" keys.
+training_info_post = """
+It is worth noting that you should judge both the duration of a single event and the total duration based on experience and the examples I provided. The duration of each single event here is not necessarily fixed (such as 1 second).
+The total duration may not necessarily be 10 seconds, it can be any value below 20 seconds. you should give me the answer as {"onset":" ","captions": " ", "length": " "}'
+"""
+
+def get_time_info(caption, timeout=(10, 120)):
+    """Ask the chat-completions endpoint to turn ``caption`` into timing info.
+
+    The prompt is ``training_info_pri``, the quoted caption and
+    ``training_info_post``, sent as a single user message.
+
+    Args:
+        caption: Free-text description of the audio events.
+        timeout: Forwarded to ``requests.post``; a ``(connect, read)`` pair in
+            seconds. Without a timeout the call can block forever when the
+            service stalls.
+
+    Returns:
+        The message content of the first choice when the HTTP status is 200,
+        otherwise ``None`` (the error is printed).
+
+    Raises:
+        requests.exceptions.RequestException: On network failure or timeout.
+    """
+    prompt = (
+        f"{training_info_pri}\n"
+        f'Now,you can transform "captions":\n'
+        f'"{caption}"\n'
+        f"{training_info_post}"
+    )
+    data = {
+        "model": "gpt-5-mini",
+        "stream": False,
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are a helpful assistant."
+            },
+            {
+                "role": "user",
+                "content": prompt
+            }
+        ]
+    }
+    response = requests.post(url, headers=headers, json=data, timeout=timeout)
+    if response.status_code != 200:
+        print(f"Error: {response.status_code}, {response.text}")
+        return None
+
+    # Decode the body once and reuse it for both the log line and the result.
+    content = response.json()['choices'][0]['message']['content']
+    print(content)
+    return content
+
+if __name__ == "__main__":
+    # Manual smoke test: query the service with a sample caption and show the reply.
+    print(get_time_info("a dog barks followed by a cat meows 2 times"))
\ No newline at end of file
diff --git a/utils/log_helper.py b/utils/log_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcb011c02dca7c42c9b387c4f126ba3e9fe7efb4
--- /dev/null
+++ b/utils/log_helper.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+from dataclasses import dataclass
+import logging
+
+
+@dataclass
+class LoggingLogger:
+    """Factory for a ``logging.Logger`` that writes to a file."""
+
+    # Path of the log file; it is also appended to the logger name.
+    filename: str | Path
+    # Name of a ``logging`` level constant, e.g. "INFO" or "DEBUG".
+    level: str = "INFO"
+
+    def create_instance(self):
+        """Return the logger that writes to ``self.filename``.
+
+        The logger is named ``<module name>.<filename>``, set to
+        ``self.level``, and formats records as ``[asctime] - message``.
+        Calling this repeatedly returns the same logger without attaching a
+        second handler for the same file.
+
+        Raises:
+            AttributeError: If ``self.level`` is not an attribute of ``logging``.
+        """
+        import os
+
+        filename = str(self.filename)
+        logger = logging.getLogger(__name__ + "." + filename)
+        logger.setLevel(getattr(logging, self.level))
+
+        # logging.getLogger hands back the same object for the same name, so
+        # without this check every call would attach one more FileHandler and
+        # each record would be written once per call.
+        target = os.path.abspath(filename)
+        already_attached = any(
+            isinstance(handler, logging.FileHandler)
+            and handler.baseFilename == target
+            for handler in logger.handlers
+        )
+        if not already_attached:
+            file_handler = logging.FileHandler(filename)
+            file_handler.setFormatter(
+                logging.Formatter("[%(asctime)s] - %(message)s")
+            )
+            logger.addHandler(file_handler)
+
+        return logger
diff --git a/utils/lr_scheduler_utilities.py b/utils/lr_scheduler_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd0333a6c51bbc1f307213eced5d34b4b906d22
--- /dev/null
+++ b/utils/lr_scheduler_utilities.py
@@ -0,0 +1,154 @@
+from typing import Any
+import math
+import copy
+from torch.utils.data import DataLoader
+
+
+def get_warmup_steps(
+    dataloader_one_pass_outside_steps: int,
+    warmup_steps: int | None = None,
+    warmup_epochs: float | None = None,
+    epoch_length: int | None = None,
+) -> int:
+    """
+    Resolve the number of warmup steps.
+
+    An explicit `warmup_steps` wins and is returned untouched. Otherwise the
+    result is `int(epoch_length * warmup_epochs)`, where `epoch_length` falls
+    back to `dataloader_one_pass_outside_steps` when it is not given.
+    """
+    if warmup_steps is not None:
+        return warmup_steps
+
+    steps_per_epoch = (
+        dataloader_one_pass_outside_steps
+        if epoch_length is None else epoch_length
+    )
+    assert warmup_epochs is not None, "warmup_steps and warmup_epochs cannot be both None"
+    return int(steps_per_epoch * warmup_epochs)
+
+
+def get_dataloader_one_pass_outside_steps(
+    train_dataloader: DataLoader,
+    num_processes: int = 1,
+):
+    """
+    Per-process dataloader length: `len(train_dataloader) / num_processes`,
+    rounded up (the length seen once the loader is split across processes).
+    """
+    total_batches = len(train_dataloader)
+    return math.ceil(total_batches / num_processes)
+
+
+def get_total_training_steps(
+    train_dataloader: DataLoader,
+    epochs: int,
+    num_processes: int = 1,
+    epoch_length: int | None = None
+):
+    """
+    Total count of "visible" training steps over the whole run.
+
+    A given `epoch_length` is taken as the fixed number of steps per epoch.
+    When it is omitted, the per-process length of `train_dataloader` is used.
+
+    Args:
+        train_dataloader:
+            Training dataloader object.
+        epochs:
+            The total number of epochs to run.
+        num_processes:
+            The number of parallel processes used for distributed training.
+        epoch_length:
+            A fixed number of training steps for each epoch. Defaults to None.
+
+    Returns:
+        int: The total number of training steps (i.e., `epochs * epoch_length`).
+    """
+    if epoch_length is not None:
+        # The caller fixed the epoch length explicitly.
+        return epochs * epoch_length
+
+    # Otherwise one epoch is one pass over the per-process dataloader.
+    steps_per_epoch = get_dataloader_one_pass_outside_steps(
+        train_dataloader, num_processes
+    )
+    return epochs * steps_per_epoch
+
+
+def get_dataloader_one_pass_steps_inside_accelerator(
+    dataloader_one_pass_steps: int, gradient_accumulation_steps: int,
+    num_processes: int
+):
+    """
+    Number of "visible" steps inside the accelerator for one pass over the
+    dataloader, taking gradient accumulation and the process count into account.
+
+    Args:
+        dataloader_one_pass_steps:
+            The number of steps (batches) in one pass over the dataset.
+        gradient_accumulation_steps:
+            The number of steps to accumulate gradients before performing a parameter update.
+        num_processes:
+            The number of parallel processes used for distributed training.
+
+    Returns:
+        int: `ceil(dataloader_one_pass_steps / gradient_accumulation_steps)`
+             multiplied by `num_processes`.
+    """
+    accumulation_groups = math.ceil(
+        dataloader_one_pass_steps / gradient_accumulation_steps
+    )
+    return accumulation_groups * num_processes
+
+
+def get_steps_inside_accelerator_from_outside_steps(
+    outside_steps: int, dataloader_one_pass_outside_steps: int,
+    dataloader_one_pass_steps_inside_accelerator: int,
+    gradient_accumulation_steps: int, num_processes: int
+):
+    """
+    Translate "outside" steps (as counted by a logger such as wandb) into the
+    matching number of "inside" steps (as counted by the accelerate lr scheduler).
+
+    Completed dataloader passes contribute
+    `dataloader_one_pass_steps_inside_accelerator` each; the unfinished pass
+    contributes `num_processes` for every full group of
+    `gradient_accumulation_steps` outside steps.
+
+    Args:
+        outside_steps:
+            The total number of steps counted outside accelerate context.
+        dataloader_one_pass_outside_steps:
+            The number of steps (batches) to complete one pass of the dataloader
+            outside accelerate.
+        dataloader_one_pass_steps_inside_accelerator:
+            The number of `lr_scheduler.step()` calls inside accelerate, calculated via
+            `get_dataloader_one_pass_steps_inside_accelerator`.
+        gradient_accumulation_steps:
+            The number of steps to accumulate gradients.
+        num_processes:
+            The number of parallel processes (GPUs) used in distributed training.
+
+    Returns:
+        int: The total number of `lr_scheduler.step()` calls inside accelerate that
+        correspond to the given `outside_steps`.
+    """
+    full_passes, leftover_outside_steps = divmod(
+        outside_steps, dataloader_one_pass_outside_steps
+    )
+    # accelerate scheduler call `step()` `num_processes` times every
+    # `gradient_accumulation_steps` steps:
+    # https://github.com/huggingface/accelerate/blob/main/src/accelerate/scheduler.py#L76
+    leftover_inside_steps = (
+        leftover_outside_steps // gradient_accumulation_steps * num_processes
+    )
+    return (
+        full_passes * dataloader_one_pass_steps_inside_accelerator
+        + leftover_inside_steps
+    )
+
+
+def lr_scheduler_param_adapter(
+    config_dict: dict[str, Any], num_training_steps: int, num_warmup_steps: int
+) -> dict[str, Any]:
+    """
+    Return a deep copy of `config_dict`, adding `num_training_steps` and
+    `num_warmup_steps` when its `_target_` is `transformers.get_scheduler`.
+    The input dict is never modified.
+    """
+    needs_step_counts = config_dict["_target_"] == "transformers.get_scheduler"
+    adapted = copy.deepcopy(config_dict)
+    if not needs_step_counts:
+        return adapted
+
+    adapted.update(
+        num_training_steps=num_training_steps,
+        num_warmup_steps=num_warmup_steps,
+    )
+    return adapted
diff --git a/utils/merge_content.json b/utils/merge_content.json
new file mode 100644
index 0000000000000000000000000000000000000000..de31715833115d135c88298ab934a010e62e624b
--- /dev/null
+++ b/utils/merge_content.json
@@ -0,0 +1,6196 @@
+[
+    {
+        "event": "accelerating",
+        "phrases": [
+            "accelerating",
+            "accelerating revving",
+            "accelerating vehicle",
+            "accelerating engine",
+            "accelerating vehicle engine",
+            "vehicle motor accelerating and deaccelerating",
+            "increased revving",
+            "engine accelerating",
+            "accelerating and revving",
+            "an engine decelerates then accelerates",
+            "revving vehicle",
+            "accelerating a vehicle",
+            "a revving",
+            "reving",
+            "a series of accelerating",
+            "an engine revving and then slowing",
+            "accelerating motor",
+            "engine accelerating and deccelerating",
+            "accelerating engines",
+            "an engine gradually revs down",
+            "engine revving",
+            "engine accelerating rapidly",
+            "engine motor revving",
+            "engine acceleration",
+            "acceleration",
+            "engine accelerating then slowing down",
+            "an even stronger revving",
+            "accelerating car",
+            "a vehicle engine accelerating at a rapid rate",
+            "The engine roared as the vehicle picked up speed",
+            "A deep, powerful hum filled the air as acceleration began",
+            "The sound of the motor intensified as the vehicle surged forward",
+            "A sharp, rising whirring noise accompanied the vehicle's acceleration",
+            "A steady increase in engine noise signaled the vehicle speeding up"
+        ]
+    },
+    {
+        "event": "air_horn",
+        "phrases": [
+            "air horns",
+            "air horns blare in a series",
+            "air horn",
+            "airhorn",
+            "air horn sounds",
+            "horns sound",
+            "air horns alternate",
+            "air horns blare twice",
+            "air horn sound",
+            "air horns sound",
+            "air horns blow",
+            "a brief, loud horn",
+            "a horn blows in different patterns and tunes",
+            "air raid siren with delay",
+            "a sudden series of a horn blowing loudly",
+            "A loud, piercing blast echoed through the air",
+            "The deep, resonant honk of an air horn cut through the noise",
+            "A sudden blaring sound burst out, commanding attention",
+            "A booming horn sound reverberated with authority",
+            "The unmistakable blast of an air horn signaled its presence"
+        ]
+    },
+    {
+        "event": "aircraft",
+        "phrases": [
+            "aircraft",
+            "aircraft fly",
+            "aircraft motor",
+            "aircraft noise",
+            "aircraft engine",
+            "an aircraft approaches",
+            "aircraft flies",
+            "aircraft approach",
+            "aircrafts",
+            "an aircraft passes",
+            "fixed-wing aircraft fly overhead",
+            "an aircraft propeller",
+            "an aircraft flies by",
+            "an aircraft moves fast",
+            "airplane engine",
+            "an aircraft engine goes by",
+            "aircraft sounds",
+            "an aircraft engine roars by",
+            "an aircraft approaches and passes",
+            "a propeller aircraft flies by loudly",
+            "an aircraft engine stuttering",
+            "an aircraft engine swirls in the background",
+            "large aircraft taking off",
+            "an aircraft engine gets louder as it approaches",
+            "another aircraft engine passing by",
+            "an aircraft passes by loudly",
+            "an aircraft is performing a flyby",
+            "an aircraft engine",
+            "an aircraft engine approaching closer",
+            "A distant, steady hum of an aircraft filled the sky",
+            "The faint roar of engines could be heard overhead",
+            "A low-frequency rumble signified an aircraft passing by",
+            "The rhythmic drone of an aircraft traveled across the atmosphere",
+            "A continuous engine noise hovered in the background, marking flight"
+        ]
+    },
+    {
+        "event": "aircraft_engine",
+        "phrases": [
+            "aircraft engine sound",
+            "aircraft approach",
+            "airplane",
+            "air propellers",
+            "prop engine sound",
+            "airplanes",
+            "airoplane sound",
+            "a plane's engine starts",
+            "aircraft noise",
+            "aircraft",
+            "airplane starting roughly",
+            "aircraft play",
+            "plane takes off",
+            "helicopter",
+            "a voice of idling aircraft engine",
+            "aircraft fly",
+            "aircraft flies",
+            "engine of airplane taking off",
+            "jets",
+            "aircraft engine noise",
+            "airplane engine",
+            "A powerful, steady engine roar resonated as the aircraft operated",
+            "The deep and consistent hum of an aircraft engine was distinct",
+            "A mechanical whirring filled the air as the engine powered up",
+            "A steady droning noise emanated from the aircraft's engine",
+            "The unmistakable sound of an aircraft engine dominated the atmosphere"
+        ]
+    },
+    {
+        "event": "alarm",
+        "phrases": [
+            "alarm",
+            "an alarm",
+            "alarms",
+            "futuristic alarm sound",
+            "a warning alarm",
+            "alarm sound from an energy cube vault",
+            "an alarm sound is being synthesized with reverb and panning",
+            "a distress alarm",
+            "alarm sound made with feedback loop",
+            "a composite alarmsample is being looped",
+            "synthesized alarm sound",
+            "funky alarm sound",
+            "alarm blare",
+            "an alarm from space",
+            "an emergency alarm",
+            "a composite alarm sample is playing",
+            "security system alarm",
+            "synthesized alarm tone",
+            "alarm with repetition",
+            "an alarm is ringing in a media library",
+            "another alarm",
+            "an alarm for spaceships or robots",
+            "a shrill back-up alarm rings",
+            "setting off an alarm repeatedly",
+            "more alarms",
+            "an alarm-like soundscape",
+            "an alarm beeps loudly multiple times",
+            "A loud, urgent ringing sound pierced through the environment",
+            "The stark wail of an alarm signaled potential danger",
+            "A repetitive beeping noise filled the air as the alarm activated",
+            "An insistent, high-pitched alarm tone demanded immediate attention",
+            "The shrill sound of an alarm broke the silence"
+        ]
+    },
+    {
+        "event": "alarm_clock",
+        "phrases": [
+            "alarm clock",
+            "alarm clock is put down and continues playing",
+            "an alarm clock goes off repeatedly",
+            "an alarm clock goes off intermittently",
+            "alarm clocks",
+            "loud alarm clock sound effect",
+            "an alarm clock goes off with mechanical sounds",
+            "an alarm clock is heard repeatedly",
+            "alarm clocks buzz repetitively",
+            "an alarm clock goes off and later stops",
+            "an alarm clock is ringing repeatedly",
+            "an alarm clock beeping continuously",
+            "an alarm clock is ringing continuously",
+            "multiple alarm clocks",
+            "an alarm clock",
+            "an alarm is ringing continuously and repeatedly",
+            "alarm clocks are heard repeatedly",
+            "crossing alarm sounding",
+            "an alarm clock rings repeatedly",
+            "alarm clock is blaring loudly",
+            "an alarm clock beeps repeatedly",
+            "an alarm clock beeping a tune",
+            "a clock sounds an alarm",
+            "a wrist watch alarm",
+            "an alarm clock sounds intermittently",
+            "various alarms",
+            "an alarm is ringing from a clock radio",
+            "an alarm clock is ticking repeatedly",
+            "an alarm clock sounds repeatedly",
+            "alarm clock sounds",
+            "A sharp, repetitive beeping sound signaled the start of the day",
+            "A rhythmic ringing noise came from the alarm clock",
+            "The consistent chime of an alarm clock filled the room",
+            "A sudden burst of sound from the alarm clock broke the quiet",
+            "The insistent sound of an alarm clock urged action"
+        ]
+    },
+    {
+        "event": "ambulance_(siren)",
+        "phrases": [
+            "a two-tone emergency vehicle siren blows as it approaches, passes by, and then fades into the distance",
+            "a couple of sirens blaring one after the other",
+            "sirens ring as they approach",
+            "several sirens, sounding at different frequencies",
+            "a siren slowly gets louder as it approaches",
+            "an emergency vehicle siren blows, approaching and then fading",
+            "ambulance siren",
+            "an emergency vehicle blasts a series of sirens",
+            "an emergency vehicle siren wails and echoes",
+            "a siren getting louder as it approaches",
+            "an emergency vehicle approaches with its siren blaring",
+            "emergency sirens are blaring as they approach and pass by",
+            "loud siren gets closer and then gets further away",
+            "a series of emergency sirens sounding",
+            "loud, continuous ambulance sirens",
+            "several different ambulance sirens are triggered subsequently",
+            "an emergency vehicle siren wails in different patterns",
+            "an ambulance sounds its siren",
+            "ambulance sirens",
+            "different types of ambulance vehicle sirens are blaring in turn",
+            "an ambulance siren is triggered and moves to get closer",
+            "an ambulance siren is triggered and moves to get closer and then further",
+            "an ambulance siren receding as the ambulance drives away",
+            "an ambulance's siren",
+            "A wailing siren sound echoed in the distance",
+            "The oscillating pitch of a siren signaled an ambulance approaching",
+            "A high-pitched siren sound cut through the air, alerting others",
+            "The urgent, alternating tones of an ambulance siren were unmistakable",
+            "A rapid, piercing siren sound announced the ambulance's presence"
+        ]
+    },
+    {
+        "event": "animal",
+        "phrases": [
+            "animal noise",
+            "the sounds of an animal",
+            "animal sound",
+            "the sounds of animals",
+            "animal sound effects",
+            "an animal",
+            "animal sounds",
+            "animal sounds outside",
+            "animal contact sounds",
+            "an animal sound effect",
+            "animal calling",
+            "animal noises",
+            "animal sounds occur",
+            "animal roars",
+            "another animal",
+            "A series of chirping noises filled the environment",
+            "The low growl of an animal echoed softly",
+            "Distinct animal calls could be heard in the distance",
+            "Various animal vocalizations added a natural melody to the surroundings",
+            "The continuous chatter of animals created a lively ambiance"
+        ]
+    },
+    {
+        "event": "applause",
+        "phrases": [
+            "applause",
+            "applauding",
+            "applause from audience",
+            "applause erupts",
+            "small audience ovation",
+            "general applause",
+            "crowd applauses",
+            "canned applause",
+            "long applause",
+            "crowd applauds",
+            "crowd applause",
+            "audience applause",
+            "cheering",
+            "crowd applausing",
+            "crowd applaud",
+            "clapping audience",
+            "applause amid crowds",
+            "cheering with applause",
+            "applauses",
+            "audience applauds",
+            "applause breaks out",
+            "claps",
+            "crowd claps",
+            "enthusiastic applause",
+            "polite applause",
+            "clapping begins",
+            "applause sounds",
+            "clapping hands",
+            "audience gives applause",
+            "applause and clapping",
+            "A rhythmic clapping sound erupted from the crowd",
+            "The sound of multiple hands clapping echoed in the space",
+            "A burst of applause broke out, filling the air with energy",
+            "The collective clapping noise created a wave of enthusiasm",
+            "A loud ovation of clapping resonated through the area"
+        ]
+    },
+    {
+        "event": "artillery_fire",
+        "phrases": [
+            "artillery fire",
+            "artillery",
+            "artillery fires",
+            "artillery fire erupts",
+            "artillery starts firing",
+            "artillery guns fire",
+            "artillery firing",
+            "artillery fire booms",
+            "artillery rounds fire",
+            "artillery cannons firing",
+            "artillery cannons firing several times",
+            "artillery is launched",
+            "artillery fire goes off",
+            "artillery cannons firing several times with an echo",
+            "artillery sounds",
+            "artillery fire occurs",
+            "artillery is fired in the distance",
+            "artillery fire rings out",
+            "artillery fire sounds",
+            "artillery fire occurs once",
+            "artillery fires in the background",
+            "A loud, booming explosion echoed through the air",
+            "A deep, resounding blast marked the firing of artillery",
+            "A sharp, cracking sound accompanied the artillery discharge",
+            "The thunderous sound of artillery fire shook the surroundings",
+            "A distant rumble signaled the power of artillery in action"
+        ]
+    },
+    {
+        "event": "baby_cry",
+        "phrases": [
+            "infant cries",
+            "an infant cries continously",
+            "baby cry",
+            "an infant is crying loudly and persistently",
+            "an infant cries repeatedly and loudly",
+            "a baby is crying very deeply in way that reverberates",
+            "infants cry",
+            "a baby cries over and over",
+            "an infant crying consistently",
+            "cry",
+            "baby crying sounds",
+            "a baby crying repeatedly and loudly",
+            "an infant cries continuously",
+            "baby cries",
+            "a baby continuously cries",
+            "young infant crying hard",
+            "two infants cry together",
+            "a young infant cries for a short while",
+            "an infant is crying continuously",
+            "an infant cries funnily",
+            "baby upset",
+            "baby is crying during birth",
+            "an infant crying continuously",
+            "an infant cries loudly and harshly",
+            "an infant cries repeatedly",
+            "infant screaming",
+            "a baby crying unceasingly",
+            "an infant cries repeatedly and softly",
+            "a crying infant",
+            "a small infant cries repeatedly",
+            "A high-pitched, wailing sound of a distressed infant filled the air",
+            "The sharp, repetitive crying of a baby was unmistakable",
+            "A soft whimper escalated into full-blown crying",
+            "The piercing sound of an infant crying echoed through the space",
+            "A rhythmic bawling sound indicated a baby's need for attention"
+        ]
+    },
+    {
+        "event": "baby_laughter",
+        "phrases": [
+            "baby laughs",
+            "baby jabbering",
+            "baby laugh",
+            "infant laughs",
+            "infant jabbering",
+            "infant laugh",
+            "infants laughter",
+            "infant laughter",
+            "baby laughter",
+            "baby gurgling laughter",
+            "an infant laughs continuously",
+            "baby laughing sound",
+            "infant laughing sound",
+            "two infants laugh",
+            "a baby giggles sporadically",
+            "a baby laughs loudly and frequently",
+            "A series of high-pitched giggles from a baby rang out joyfully",
+            "The bubbly laughter of an infant filled the room with happiness",
+            "A soft, melodic chuckle from a baby brought a sense of warmth",
+            "A baby's contagious laughter spread through the air",
+            "The cheerful giggling of a baby created a lively atmosphere"
+        ]
+    },
+    {
+        "event": "bark",
+        "phrases": [
+            "bark",
+            "real dog bark",
+            "dog bark",
+            "dog barks",
+            "barks",
+            "a dog barks four times",
+            "dog barks twice",
+            "barking",
+            "two dogs bark",
+            "barking dog",
+            "a dog barks over and over",
+            "a dog bark echoes",
+            "a dog barks multiple times",
+            "a dog barks sharply",
+            "a dog barks multiple times loudly nearby",
+            "a series of dog barks",
+            "a bark intermittently",
+            "dog barks loudly",
+            "aggressive dog barking",
+            "big dog barking",
+            "dogs bow-wow",
+            "an angry big dog is barking",
+            "a dog barks furiously",
+            "the dog barking four times",
+            "dog barking repeatedly",
+            "a dog barks several times",
+            "a dog barks quickly several times",
+            "dog barking",
+            "a dog barks urgently",
+            "the dog inside barks two deliberate barks",
+            "A sharp, loud bark echoed through the area",
+            "The repetitive woof of a dog broke the silence",
+            "A low growl transitioned into a firm bark",
+            "The distinct sound of a dog's bark signaled its presence",
+            "A series of short, sharp barks conveyed urgency"
+        ]
+    },
+    {
+        "event": "bee",
+        "phrases": [
+            "bee sounds",
+            "bees buzz",
+            "bees buzz sounds",
+            "bee sound",
+            "several bees fly",
+            "bees",
+            "bees are buzzing",
+            "bees buzzing",
+            "bees fly",
+            "a bee",
+            "bee sound",
+            "a bee is buzzing",
+            "a bee is flying",
+            "a bee flies",
+            "bees fly",
+            "bees are lightly buzzing",
+            "bee fly sound",
+            "bee flying sound",
+            "a few bees fly nearby",
+            "a few bees buzz around",
+            "bees swarm loudly nearby",
+            "bees buzzing faintly",
+            "bees flying faintly",
+            "bees fly in the distance",
+            "bees whining",
+            "bees are flying around",
+            "flying bees",
+            "A soft, continuous buzzing sound filled the air",
+            "The distinctive hum of bees at work was audible nearby",
+            "A faint buzzing grew louder as the bees approached",
+            "The rhythmic buzz of bees created a natural melody",
+            "The gentle hum of bees added a sense of activity to the surroundings"
+        ]
+    },
+    {
+        "event": "beep",
+        "phrases": [
+            "beep",
+            "bleep",
+            "a warning beep",
+            "a device beep",
+            "a quick beep",
+            "an electronic beep",
+            "broken beep",
+            "an electronic bleep beeping three times",
+            "quick beep",
+            "an electronic bleep beeping once",
+            "a mechanical beep",
+            "error beep",
+            "a large beep",
+            "item beeping",
+            "a beeping",
+            "digital beeping",
+            "a second beep",
+            "an electronic bleep beeping repeatedly",
+            "a small digital beep",
+            "longer beep",
+            "a short beep",
+            "a beep repeats multiple times",
+            "single beep",
+            "a loud digital beep",
+            "beep beep beep",
+            "beeps",
+            "a digital beep repeating",
+            "a digital beeping",
+            "a beep repeats",
+            "beeping",
+            "A short, high-pitched beep broke the stillness",
+            "The repetitive beeping of a machine was clearly audible",
+            "A single, sharp beep indicated a completed action",
+            "The consistent tone of a beep filled the background",
+            "An intermittent beeping sound captured attention"
+        ]
+    },
+    {
+        "event": "bell",
+        "phrases": [
+            "bell",
+            "bell ring",
+            "bell toll sound",
+            "station bell",
+            "bells",
+            "meditation bells",
+            "small bell",
+            "a bell ding",
+            "a hand bell is striking the note c#/db",
+            "single bell beat",
+            "bell effects",
+            "a single hand bell strike",
+            "ring bell sounds",
+            "a metal bell ding",
+            "a bell-like resonance with few overtones",
+            "a warning bell",
+            "bells ring outside",
+            "a loud bell",
+            "a hand bell is being struck with the note A",
+            "a bell rings",
+            "bell sounds",
+            "a bell chimes loudly",
+            "a warning bell dings",
+            "a bell",
+            "a bell rings out",
+            "bell rings",
+            "a bell sound",
+            "bell dings",
+            "bells ring out in a melody",
+            "A clear, resonant ringing sound of a bell reverberated",
+            "A soft chime of a bell broke the silence",
+            "The metallic ringing of a bell echoed through the surroundings",
+            "A single, crisp bell tone signified an event",
+            "The rhythmic tolling of a bell filled the air"
+        ]
+    },
+    {
+        "event": "bicycle_bell",
+        "phrases": [
+            "sound of a bike bell",
+            "a bike bell",
+            "bicycle bell",
+            "a bicycle bell",
+            "a bike bell rings",
+            "a bicycle bell is sounding off",
+            "bike bell is ringing",
+            "bike bell is being hit continuously",
+            "bike bell is ringing continuously",
+            "a bike bell is ringing",
+            "a bike bell ringing",
+            "a bicycle bell rings",
+            "a bicycle bell is being rung",
+            "a bicycle bell ringing through a crowded street",
+            "a bicycle bell dings",
+            "a bicycle bell rings twice",
+            "a bicycle bell is ringing",
+            "a bicycle bell ring",
+            "a bicycle bell being rung several times",
+            "a bike bell ringing on the street",
+            "a bicycle rings its bell",
+            "A sharp ding from a bicycle bell alerted nearby pedestrians",
+            "The cheerful chime of a bicycle bell rang out clearly",
+            "A repetitive dinging sound came from a bicycle bell",
+            "The distinct ring of a bicycle bell broke through the ambient noise",
+            "A crisp bicycle bell tone signaled its approach"
+        ]
+    },
+    {
+        "event": "bird",
+        "phrases": [
+            "bird",
+            "bird tweet",
+            "the sounds of birds",
+            "the sounds of bird calls",
+            "pet birds tweet",
+            "birds tweet",
+            "bird tweets",
+            "birds chirp",
+            "a bird sound",
+            "bird squawk",
+            "bird call",
+            "bird chirps",
+            "singing of a bird",
+            "birds tweets",
+            "the chirping of a bird",
+            "bird song",
+            "bird tweeting",
+            "a group of birds chirp",
+            "chirping of a bird",
+            "a series of bird chirping",
+            "The melodic chirping of birds filled the morning air",
+            "A series of high-pitched bird calls echoed nearby",
+            "The natural warbling of birds created a peaceful ambiance",
+            "The rhythmic tweeting of birds was unmistakable",
+            "A soft flurry of bird calls added life to the surroundings"
+        ]
+    },
+    {
+        "event": "bird_flight",
+        "phrases": [
+            "birds are taking flight",
+            "birds flight",
+            "birds are flying",
+            "birds taking flight",
+            "birds flight sound",
+            "birds flying sound",
+            "sounds of birds flight",
+            "birds fly around",
+            "birds fly nearby",
+            "a bird is fluttering in flight",
+            "sound of bird flight",
+            "sound of bird flying",
+            "birds flap their wings in flight",
+            "birds are making flight sounds",
+            "the sounds of bird flight",
+            "a bird flying away",
+            "a bird flying off",
+            "several birds fly",
+            "A soft flapping sound marked a bird in motion",
+            "The faint rustling of wings was audible as the bird took off",
+            "A steady whooshing noise accompanied the bird's flight",
+            "The gentle fluttering of wings echoed softly",
+            "The rhythmic beat of wings signified a bird in flight"
+        ]
+    },
+    {
+        "event": "bird_vocalization",
+        "phrases": [
+            "finch sound",
+            "a bird song phrase",
+            "calls from a single bird",
+            "a bird making a call",
+            "bird",
+            "birds outside",
+            "a bird tweets sharply",
+            "bird vocalizations outside",
+            "a bird calling three times",
+            "a bird song is playing",
+            "bird singing",
+            "bird songs",
+            "loud bird song",
+            "a bird is tweeting a bird song",
+            "a sweet bird song",
+            "a bird is chirping",
+            "a bird vocalizes repeatedly",
+            "a bird vocalizes",
+            "a drumming bird call",
+            "bird calling out",
+            "various bird calls",
+            "A melodious bird song resonated through the air",
+            "The chirping and tweeting of birds created a harmonious melody",
+            "A complex series of bird vocalizations filled the environment",
+            "The melodic trills and whistles of a bird added charm to the surroundings",
+            "The varied calls of birds blended into a soothing soundscape"
+        ]
+    },
+    {
+        "event": "bleat",
+        "phrases": [
+            "bleat",
+            "bleats",
+            "livestock bleat",
+            "goat bleeping",
+            "bleating",
+            "lamb bleating",
+            "an animal bleats three times",
+            "an animal bleat at a constant pace",
+            "a sheep bleats two times",
+            "a sheep baa baa",
+            "sheep baah",
+            "sheep bleat by her",
+            "goat bleat",
+            "goat bleating",
+            "repetitive bleating of a goat",
+            "an animal bleats loudly",
+            "animal bleats",
+            "goat bleeting",
+            "sheep bleat",
+            "the sheep bleat",
+            "sheep baaing",
+            "sheep",
+            "sheep baa",
+            "goat crying out",
+            "a sheep bleats a couple of times",
+            "the bleating cry of a sheep",
+            "animal bleating",
+            "The soft, repetitive bleating of a sheep echoed in the distance",
+            "A high-pitched bleat signaled the presence of a goat",
+            "The distinct bleating of livestock added a rural ambiance",
+            "A rhythmic bleat carried through the air, filling the surroundings",
+            "A lone bleat cut through the quiet, attracting attention"
+        ]
+    },
+    {
+        "event": "boing",
+        "phrases": [
+            "boing",
+            "a \"boing\"",
+            "a \"boing\" sound",
+            "boing sound",
+            "boing sound effect",
+            "boing sound effects",
+            "boinging",
+            "a boing",
+            "a boing occurs",
+            "boings",
+            "a boing sound",
+            "doorbell rings",
+            "a dong sounds",
+            "A sharp, springy boing sound echoed playfully",
+            "The resonant boing of a bouncing object was clear",
+            "A tonal, upward-sweeping boing sound filled the space",
+            "The unique spring-like boing sound was unmistakable",
+            "A quick boing sound broke the silence, adding a playful touch"
+        ]
+    },
+    {
+        "event": "breaking",
+        "phrases": [
+            "breaking",
+            "breaking sound",
+            "a breaking sound",
+            "something breaking",
+            "something breaks",
+            "things break",
+            "things break nearby",
+            "something is breaking",
+            "a sound of something breaking",
+            "the sound of things breaking",
+            "shatter",
+            "shatter sound",
+            "a shatter sound",
+            "something shatters",
+            "things shatter nearby",
+            "a sound of something shattering",
+            "A sharp, cracking noise signaled something breaking",
+            "The sound of shattering filled the air as an object broke apart",
+            "A loud snap followed by a crunch indicated breaking",
+            "The distinct cracking noise of breaking material was unmistakable",
+            "A sudden, loud break was heard as the object fractured"
+        ]
+    },
+    {
+        "event": "breathing",
+        "phrases": [
+            "breathing",
+            "breathing heavy",
+            "breaths",
+            "breathing intermittently",
+            "breathing noise",
+            "breathing sound",
+            "a breathing",
+            "breath blowing",
+            "breathes",
+            "breathing heavily",
+            "breathing audible",
+            "someone is breathing in and out close to the microphone",
+            "breathing like a breathing apparatus",
+            "breathing sounds",
+            "someone is breathing after holding breath",
+            "someone is breathing loudly in and out",
+            "brief heaving breathing",
+            "a person is breathing loudly and deeply",
+            "breathing with surface contact",
+            "human breathing",
+            "breathing is audible",
+            "someone heavily breathing close to the microphone",
+            "breath sounds",
+            "someone is breathing in and out",
+            "a person breathes moderately",
+            "breathing in between",
+            "man breathing",
+            "deep breathing",
+            "labored breathing",
+            "breathing over surface contact",
+            "A soft, rhythmic breathing sound was clearly audible",
+            "The steady inhale and exhale of breath filled the space",
+            "A faint, labored breathing noise indicated exertion",
+            "The sound of quickened breathing suggested heightened activity",
+            "A calm, slow breathing sound created a sense of relaxation"
+        ]
+    },
+    {
+        "event": "burping",
+        "phrases": [
+            "burping",
+            "burps",
+            "burping noises one after another",
+            "burping sounds",
+            "burping occurs",
+            "burping noises",
+            "burping sound coming from a person",
+            "non-distorted burps",
+            "several burping noises one after another",
+            "burping occurs repeatedly",
+            "human burping",
+            "someone burps for a few seconds in a row",
+            "repetitive burping",
+            "a series of deep burping noises one after another",
+            "burping noise",
+            "a short burp",
+            "a series of burping noises",
+            "a brief short burp",
+            "a burp occurs",
+            "a person burps loudly for a long time nearby",
+            "burping takes place",
+            "loud, long burps",
+            "a person burps loudly and steadily",
+            "a series of burps one after another",
+            "a person burps for a period",
+            "a man burps for a time",
+            "A low, guttural burp interrupted the quiet",
+            "The sound of a sudden burp was unmistakable",
+            "A loud belch resonated briefly in the air",
+            "A short, sharp burping noise was heard in the background",
+            "The deep, rumbling sound of a burp added a humorous note"
+        ]
+    },
+    {
+        "event": "bus",
+        "phrases": [
+            "bus",
+            "a bus",
+            "bus sound",
+            "a bus passes",
+            "a bus passing by",
+            "a bus driving off",
+            "a bus driving",
+            "bus driving sound",
+            "bus moving",
+            "The steady hum of a bus engine filled the interior",
+            "A low rumbling sound accompanied the movement of the bus",
+            "The hiss of air brakes punctuated the bus's stop at a station",
+            "The rhythmic clatter of wheels on the road marked the bus's journey",
+            "A faint creaking noise came from the bus's suspension as it turned"
+        ]
+    },
+    {
+        "event": "busy_signal",
+        "phrases": [
+            "busy signal",
+            "busy signal sounds",
+            "busy signals",
+            "a telephone busy signal sounds",
+            "busy signals from a phone",
+            "a busy signal beeps",
+            "a busy signal ringing",
+            "a telephone is ringing with a busy signal",
+            "busy signals during telephone calls",
+            "busy signals play",
+            "A repetitive tone rang out over the phone line",
+            "The steady, rhythmic beeping of a busy signal filled the receiver",
+            "A monotone, pulsing sound indicated the line was engaged",
+            "The consistent beep of a busy tone was unmistakable",
+            "A sharp, repeating tone signaled the failure to connect"
+        ]
+    },
+    {
+        "event": "buzz",
+        "phrases": [
+            "buzz",
+            "buzzing",
+            "a buzz sound",
+            "buzzing noise",
+            "a short buzz",
+            "a buzz sounds",
+            "buzzing in stereo",
+            "it buzzes",
+            "buzzings",
+            "buzzes",
+            "buzzing occurs continuously",
+            "buzzing sound",
+            "a long buzz sounds",
+            "a buzz",
+            "buzz sounds",
+            "buzz noises",
+            "buzzing sounds",
+            "a quick buzz",
+            "buzzing vibrations",
+            "buzzing sounds produces",
+            "buzzing noises",
+            "another buzz",
+            "buzzing occurs",
+            "A rapid buzzing sound filled the air as something vibrated",
+            "The faint hum of a flying insect was audible nearby",
+            "A sharp, continuous buzz broke the silence",
+            "The background was filled with the low, vibrating buzz of motion",
+            "The rhythmic buzzing of wings created a natural vibration in the air"
+        ]
+    },
+    {
+        "event": "buzzer",
+        "phrases": [
+            "buzzer",
+            "buzzers",
+            "buzzer sound repeatedly in a series",
+            "buzzer sounds",
+            "a buzzer goes off",
+            "buzzers sound",
+            "a buzzer rings",
+            "a buzzer sound",
+            "buzzer is speaking",
+            "a buzzer going off",
+            "buzzers sound repeatedly",
+            "a buzzer sounds",
+            "a buzzer horn",
+            "an apartment buzzer is ringing",
+            "A loud, steady buzzing sound emitted from the device",
+            "The sharp tone of a buzzer signaled an alert",
+            "A mechanical buzzing noise indicated the activation of a signal",
+            "The high-pitched buzz of a warning device filled the room",
+            "A continuous, oscillating buzzer sound captured attention"
+        ]
+    },
+    {
+        "event": "camera",
+        "phrases": [
+            "camera",
+            "camera shutter",
+            "camera sound",
+            "single-lens reflex camera sounds",
+            "single-lens reflex camera",
+            "camera handling sound",
+            "camera mechanisms",
+            "camera flashes",
+            "camera interaction",
+            "camera effects",
+            "camera sounds",
+            "camera muffling",
+            "camera noise",
+            "cameras snapping",
+            "rustling with a camera",
+            "camera zooms",
+            "camera rattling",
+            "cameras taking pictures",
+            "camera clicking",
+            "camera tapping noise",
+            "a camera shot",
+            "A quick shutter click marked the capture of a photograph",
+            "A soft whirring sound accompanied the camera's autofocus",
+            "The distinct mechanical sound of a camera's shutter was heard",
+            "A brief beep indicated the camera was ready to shoot",
+            "The faint winding noise of a film camera added a nostalgic touch"
+        ]
+    },
+    {
+        "event": "car",
+        "phrases": [
+            "car",
+            "the sounds of cars",
+            "car sound",
+            "vehicle sound",
+            "the sound of cars",
+            "a car sound",
+            "car sounds",
+            "the sound of car",
+            "cars",
+            "vehicle move",
+            "a vehicle engine turns over",
+            "a car",
+            "car sound effect",
+            "car motor turning",
+            "car starts",
+            "vehicle sounds",
+            "a car passes by quickly",
+            "a car making vroom sounds",
+            "drive-by sound of a passenger car",
+            "The low purr of a car engine idling filled the space",
+            "A steady rumble accompanied the car's movement",
+            "The hiss of tires on the road was clearly audible",
+            "A faint clicking sound came from the car's turn signal",
+            "The rhythmic thud of the engine marked the car's operation"
+        ]
+    },
+    {
+        "event": "car_alarm",
+        "phrases": [
+            "car alarm",
+            "a car alarm is being set off and reset",
+            "a car alarm goes off repeatedly",
+            "a car alarm",
+            "a car alarm goes off",
+            "a car alarm is repeatedly sounding",
+            "car alarms",
+            "a car alarm is sounding repeatedly",
+            "a car alarm going off",
+            "cars alarm",
+            "a car sounds its alarm",
+            "a car alarm is being set",
+            "the sound of a car alarm",
+            "beeping of a car alarm",
+            "a car alarm beeps loudly",
+            "an alarm sounds on a motor vehicle",
+            "sound of a car alarm",
+            "a car alarm disarming beep",
+            "a car alarm is beeping",
+            "a car alarm is going off",
+            "a vehicle alarm",
+            "a car alarm blares",
+            "car alarms beep",
+            "a car alarm beeps",
+            "a car alarm ringing",
+            "a quick car alarm goes off",
+            "a car alarm is being replicated",
+            "a car alarm sounding",
+            "A loud, repetitive alarm tone blared from the car",
+            "The sharp, alternating siren of a car alarm filled the area",
+            "A high-pitched wailing sound came from the vehicle's alarm system",
+            "The insistent beeping of a car alarm signaled intrusion",
+            "A piercing alarm noise echoed as the security system activated"
+        ]
+    },
+    {
+        "event": "car_passing_by",
+        "phrases": [
+            "drive-by sound of a passenger car",
+            "vehicle pass by outside",
+            "a vehicle passed by and accelerates quickly",
+            "car sound",
+            "a car passes by quickly",
+            "a fast car moving away",
+            "a car passes by",
+            "vehicle sound",
+            "a car zooms by",
+            "vehicle driving away quickly",
+            "The low rumble of a car grew louder as it approached",
+            "A brief whooshing noise marked the car speeding past",
+            "The sound of tires on the pavement faded into the distance",
+            "A Doppler-shifted engine noise indicated movement past the listener",
+            "The rhythmic clatter of a car passing by was noticeable"
+        ]
+    },
+    {
+        "event": "cat",
+        "phrases": [
+            "cat",
+            "a cat meows",
+            "a cat hiss",
+            "a pet cat meows",
+            "a pet cat sound",
+            "a cat is growling",
+            "a cat purrs",
+            "a cat meowing in response",
+            "a cat meows and growls",
+            "a cat singing",
+            "cat sound",
+            "A soft purring sound came from the cat",
+            "The sharp meow of a feline was clearly audible",
+            "A low growling sound indicated the cat's displeasure",
+            "The rhythmic chirping of a cat added curiosity to its demeanor",
+            "The faint sound of a cat's paw scratching was heard nearby"
+        ]
+    },
+    {
+        "event": "chainsaw",
+        "phrases": [
+            "chainsaw",
+            "chainsaw cutting",
+            "chainsaw operate",
+            "a chainsaw runs cutting an object",
+            "a chainsaw runs before coming to an idle briefly",
+            "chainsaw running",
+            "chainsaws",
+            "chainsaw intermittent rev down",
+            "a chainsaw cutting and revving",
+            "chainsaw being run",
+            "a chainsaw slows down and revs again",
+            "chainsaw revs continuously",
+            "a chainsaw starting and revving",
+            "an electric chainsaw is turning on and off",
+            "a chainsaw motor",
+            "a chainsaw cutting",
+            "a chainsaw revving",
+            "a chainsaw operating and cutting through an object",
+            "a chainsaw cuts",
+            "a chainsaw engine running and revving up",
+            "a chainsaw is used and revved multiple times",
+            "a chainsaw revving sporadically",
+            "a chainsaw",
+            "a chainsaw revving up and down",
+            "chainsaw cutting wood",
+            "a chainsaw is started and begins cutting a solid object",
+            "a chainsaw runs and then stops",
+            "A loud, roaring sound indicated the chainsaw was in use",
+            "The rhythmic buzzing of the chainsaw filled the air",
+            "A high-pitched whine came from the chainsaw's blade cutting through material",
+            "The mechanical growl of the chainsaw was unmistakable",
+            "A sharp, grinding noise accompanied the chainsaw's operation"
+        ]
+    },
+    {
+        "event": "cheering",
+        "phrases": [
+            "cheering",
+            "cheering crowd",
+            "cheering crowds",
+            "cheering amid crowds",
+            "cheering with applause",
+            "cheering together",
+            "celebrations",
+            "crowd cheering",
+            "cheering at an event",
+            "cheers",
+            "cheering continues to come from the crowd",
+            "cheering with shouting",
+            "cheer",
+            "crowd cheers",
+            "crowd celebrations",
+            "background cheer",
+            "A loud, enthusiastic cheer erupted from the crowd",
+            "The sound of clapping and shouting created an energetic atmosphere",
+            "A rhythmic chant of cheering voices echoed across the area",
+            "The joyous sound of applause and cheering filled the air",
+            "A wave of cheering voices surged with excitement"
+        ]
+    },
+    {
+        "event": "child_singing",
+        "phrases": [
+            "a child singing voice",
+            "a child sings",
+            "a kid sings",
+            "a child singing",
+            "a kid singing",
+            "children sing",
+            "a child's singing",
+            "children's vocals",
+            "a child chants",
+            "child singing",
+            "a child is singing repeatedly",
+            "the child sings",
+            "children are singing in call and response",
+            "an older child singing",
+            "a child sound",
+            "a kid is singing",
+            "A soft, melodic voice of a child sang a tune",
+            "The high-pitched singing of a child filled the room",
+            "A faint humming noise accompanied the child's singing",
+            "The cheerful, rhythmic singing of a child was unmistakable",
+            "A gentle lullaby-like singing came from the child"
+        ]
+    },
+    {
+        "event": "child_speech",
+        "phrases": [
+            "kid speech",
+            "child speech",
+            "toddler speaking",
+            "child's speech",
+            "children's speech",
+            "children speech",
+            "child speeches",
+            "a speaking child",
+            "kids speech",
+            "children saying goodbye",
+            "a kid speaks",
+            "a child speaks",
+            "a child speak",
+            "young child speaking",
+            "kid speaking",
+            "a young child speaks loudly",
+            "a young child speaks",
+            "a child answers",
+            "a child is speaking phrases",
+            "a young child speaking",
+            "a child speaking",
+            "child speaks",
+            "a kid's voice",
+            "a young child is making a speech",
+            "a kid talk",
+            "a young kid speaks",
+            "child speaking",
+            "a child says words",
+            "A high-pitched voice uttered words in a child's tone",
+            "The soft chatter of a child was heard in the background",
+            "A cheerful, energetic voice marked the child's speech",
+            "The rhythmic articulation of a child speaking was audible",
+            "The playful tone of a child added liveliness to the surroundings"
+        ]
+    },
+    {
+        "event": "chime",
+        "phrases": [
+            "chime",
+            "a mystery chime",
+            "a short chime",
+            "chime accompaniment",
+            "a sound for a positive event in a game",
+            "a shining sound effect",
+            "a bell chime",
+            "a sound effect signaling a transition or completion",
+            "a chime",
+            "A soft, melodic chime rang out clearly",
+            "The resonant sound of chimes filled the air",
+            "A brief, high-pitched chime marked the passage of time",
+            "The rhythmic ringing of chimes created a soothing ambiance",
+            "The gentle tone of a chime added a musical touch to the environment"
+        ]
+    },
+    {
+        "event": "chirp",
+        "phrases": [
+            "chirp",
+            "bird tweet",
+            "chirp tone",
+            "edited bird chirp sounds",
+            "electronic sound effect",
+            "brids chirp",
+            "pet birds tweet",
+            "birds chip",
+            "a bird tweets sharply",
+            "bird vocalizations outside",
+            "a bird calling three times",
+            "a bird song is playing",
+            "bird singing",
+            "bird songs",
+            "loud bird song",
+            "a bird is tweeting a bird song",
+            "a sweet bird song",
+            "a bird is chirping",
+            "a bird vocalizes repeatedly",
+            "a bird vocalizes",
+            "a drumming bird call",
+            "chirps",
+            "A quick, high-pitched chirp came from a small bird",
+            "The rhythmic chirping of birds was clearly audible",
+            "A brief, melodic chirp broke the silence",
+            "A soft chirp added a natural element to the soundscape",
+            "The continuous chirping of small birds created a lively atmosphere"
+        ]
+    },
+    {
+        "event": "civil_defense_siren",
+        "phrases": [
+            "civil defense siren",
+            "a long drawn-out siren, tapering off at the end",
+            "a civil defense siren sounds",
+            "a european siren approaches",
+            "a civil defense siren blaring and winding down",
+            "a defense siren sounds",
+            "a siren blasts close by",
+            "a civil defense siren blares",
+            "a civil defense siren is ringing",
+            "a civil defense siren is going off",
+            "a civil defense siren blares loudly in the distance",
+            "a civil defense siren blares loudly",
+            "a civil defense siren is blaring",
+            "a civil defense siren blares in the distance",
+            "a civil defense siren blow",
+            "a civil defense siren is sounding",
+            "A loud, wailing siren echoed across the area, signaling an emergency",
+            "The oscillating tone of a civil defense siren filled the air",
+            "A high-pitched, rising and falling siren sound warned of impending danger",
+            "The continuous blare of a siren created an atmosphere of urgency",
+            "The distinct sound of a warning siren was unmistakable"
+        ]
+    },
+    {
+        "event": "clang",
+        "phrases": [
+            "clang",
+            "metal clang",
+            "a metallic clang",
+            "metal clank",
+            "a loud metal clang",
+            "metallic percussion hit",
+            "a louder metal clang",
+            "a clang sound",
+            "clangs",
+            "a metal hit",
+            "a metallic object hits",
+            "a metal clang",
+            "metal hits",
+            "metal is being hit by a hammer and ringing",
+            "a loud metal clank",
+            "metal clink",
+            "a cling",
+            "a metallic clank",
+            "A loud metallic clang resonated as the object was struck",
+            "The sharp, echoing clang of metal filled the space",
+            "A sudden, discordant clang broke the silence",
+            "The resonant clang of hollow metal rang out clearly",
+            "The deep, vibrating clang of a struck metal structure was audible"
+        ]
+    },
+    {
+        "event": "clapping",
+        "phrases": [
+            "clapping",
+            "clapping hands",
+            "clapping begins",
+            "clapping ensues",
+            "applauding and clapping",
+            "hands clapping loudly",
+            "hands clapping in applause",
+            "clapping at an event",
+            "applause and clapping",
+            "clapping and applause",
+            "clapping and applause sounds",
+            "mid frequency applause",
+            "applause being given",
+            "applauding",
+            "clapping noises",
+            "a smattering of applause",
+            "mid frequency applauding",
+            "clapping sounds",
+            "clapping from group of people",
+            "a crowd makes applause noises",
+            "an audience applauds continuously",
+            "clapping occurs",
+            "clapping takes place",
+            "loud clapping",
+            "a loud chorus of clapping",
+            "a clapping",
+            "a loud applause",
+            "an audience claps continuously",
+            "an audience applauding continuously",
+            "continued clapping",
+            "A single, sharp clap echoed in the room",
+            "The rhythmic clapping of hands created a percussive sound",
+            "A soft, quick clap marked the sound of approval",
+            "The faint sound of hands clapping was heard nearby",
+            "A solitary clap broke through the ambient noise"
+        ]
+    },
+    {
+        "event": "clicking",
+        "phrases": [
+            "clicking",
+            "tracks click",
+            "clicking sound effects",
+            "a clicking",
+            "ticks",
+            "gears click",
+            "clicks",
+            "a click repeating",
+            "tracks clicking",
+            "clicking occurs consistently",
+            "clicking repeatedly",
+            "a quick click",
+            "ticks intermittently",
+            "clicking occurs repeatedly",
+            "quick clicking",
+            "a click",
+            "a rapid, regular soft click sounds from nearby",
+            "a click track",
+            "a clicking several times",
+            "repetitive quick ticking",
+            "A quick, sharp clicking sound came from the device",
+            "The faint click of small objects tapping together was audible",
+            "A series of rhythmic clicking noises filled the air",
+            "The brief, distinct sound of a click broke the silence",
+            "A mechanical clicking noise indicated the activation of a switch"
+        ]
+    },
+    {
+        "event": "computer_keyboard",
+        "phrases": [
+            "typing sound",
+            "keyboard",
+            "keyboard sound",
+            "touchscreen typing",
+            "spray sound",
+            "tap tap sound",
+            "typing sound on computer keyboard",
+            "computer keyboard mechanisms",
+            "a man is typing on a computer keyboard",
+            "touchscreen typing",
+            "someone is typing keys on a computer keyboard",
+            "typing on a computer keyboard",
+            "typing on keyboard",
+            "typing noise",
+            "fingers typing on a keyboard",
+            "computer keyboards sound",
+            "The rhythmic tapping of keys created a steady typing sound",
+            "A soft clicking noise accompanied each keystroke",
+            "The sound of rapid typing filled the room with energy",
+            "The faint clatter of a keyboard was heard in the background",
+            "The mechanical clicking of keys indicated active use"
+        ]
+    },
+    {
+        "event": "cough",
+        "phrases": [
+            "cough",
+            "coughing",
+            "boy coughing",
+            "a boy coughs deeply",
+            "coughs",
+            "continuous coughing",
+            "man coughing",
+            "human coughing",
+            "a male coughing",
+            "male coughing",
+            "an adult male coughs",
+            "an adult male clearing his throat",
+            "someone coughs several times in quick succession",
+            "female coughing",
+            "cough sounds",
+            "a male coughs",
+            "a male clearing his throat",
+            "a man cough",
+            "two females coughing",
+            "someone is coughing a number of times",
+            "a cough",
+            "a female coughs",
+            "men cough",
+            "he coughs",
+            "coughing men",
+            "baby coughing",
+            "coughing sounds",
+            "a coughing man",
+            "an adult female coughs",
+            "A sudden, sharp cough broke the silence",
+            "A low, muffled cough was audible nearby",
+            "The repeated sound of coughing filled the air",
+            "A harsh, guttural cough echoed briefly",
+            "The distinct sound of a cough indicated discomfort"
+        ]
+    },
+    {
+        "event": "cricket",
+        "phrases": [
+            "cricket",
+            "crickets",
+            "crickets chirp intermittently in the background",
+            "crickets at sunset",
+            "crickets chirp continuously in the background",
+            "the sounds of crickets",
+            "crickets are chirping in the forest",
+            "crickets are chirping rapidly and loudly",
+            "crickets are chirping in a forest",
+            "cricket field recording",
+            "synthesized cricket sounds",
+            "a cricket chirping at steady intervals",
+            "cricket chirp",
+            "crickets chirp intermittently",
+            "crickets in bushes",
+            "crickets in recording",
+            "crickets chirp in background",
+            "crickets croak in the background",
+            "crickets chirp in the background",
+            "cricket sound recorded ",
+            "the chirp of crickets in the background",
+            "crickets are making a steady sound",
+            "crickets chirping in the background",
+            "croaking crickets",
+            "a cricket chirping loudly",
+            "crickets are chirping in the mountains",
+            "a continuous chorus of cricket sounds",
+            "crickets are chirping in background",
+            "crickets vocalize",
+            "crickets chirp from a distance",
+            "The rhythmic chirping of crickets filled the night air",
+            "A faint, continuous cricket song was audible in the background",
+            "The persistent chirping of a cricket added to the nocturnal ambiance",
+            "The high-pitched chirping of crickets created a natural melody",
+            "The distinct sound of crickets marked the quiet of the night"
+        ]
+    },
+    {
+        "event": "croak",
+        "phrases": [
+            "frogs croak",
+            "frogs chirp",
+            "multiple frogs croak at the same time",
+            "numerous number of frogs croaking",
+            "frogs vocalize",
+            "several frogs croak",
+            "long croaking from a frog",
+            "croaks",
+            "long groaning from a frog",
+            "frogs croak nearby",
+            "a frog croaks repeatedly",
+            "frogs making croaking sounds",
+            "frog sticks tongue out",
+            "A low, guttural croak came from a nearby frog",
+            "The harsh croaking of a frog echoed across the pond",
+            "A deep, raspy croak marked the frog's call",
+            "The repetitive croaking of frogs created a natural chorus",
+            "The faint sound of a croak was audible in the distance"
+        ]
+    },
+    {
+        "event": "crowd",
+        "phrases": [
+            "crowd",
+            "crowds",
+            "crowds gather",
+            "crowd is making noise from a medium perspective",
+            "large crowd",
+            "crowd of people",
+            "murmuring crowd",
+            "a loud bustling crowd",
+            "the noisy crowd",
+            "a large crowd of people are noisy",
+            "people crowd",
+            "a bustling crowd",
+            "the noise of the crowd",
+            "crowded human voices",
+            "a crowd hubbub",
+            "crowd voice",
+            "a large noisy crowd",
+            "a noisy crowd",
+            "crowd with human voices",
+            "people crowding",
+            "a large noisy crowd having fun",
+            "background noise at a crowded event",
+            "the noise of a crowd",
+            "background noise of a crowd",
+            "a large crowd mummers",
+            "a crowd's noise",
+            "a noisy crowd of people",
+            "a crowd makes hubbub",
+            "a crowd of people is milling around loudly and very close",
+            "a crowd make noise",
+            "The indistinct murmur of a crowd filled the background",
+            "A sudden cheer erupted from the crowd, breaking the ambient noise",
+            "The sound of overlapping conversations created a lively atmosphere",
+            "A wave of applause and chatter came from the large gathering",
+            "The rhythmic chanting of a crowd echoed through the area"
+        ]
+    },
+    {
+        "event": "crowing",
+        "phrases": [
+            "crowing",
+            "crowing sounds",
+            "a crowing sound",
+            "cock crowing",
+            "roosters caw",
+            "rooster crows",
+            "more crowing",
+            "rooster is crowing",
+            "various chicken crowing",
+            "crowing from bird",
+            "a cockrel crowing",
+            "a crowing",
+            "a cock crowing",
+            "cawing crows",
+            "roosters are crowing",
+            "crowing roosters",
+            "rooster crowing",
+            "a chicken crows",
+            "crows",
+            "crow caws",
+            "chickens cawing",
+            "some kind of bigger bird crows continuously",
+            "a rooster is crowing",
+            "a chicken is crowing",
+            "roosters and chickens are crowing",
+            "fowl are crowing",
+            "a chicken crowing",
+            "crows coo",
+            "a chicken is cawing",
+            "the sound of crows",
+            "A loud, rhythmic crowing sound came from a nearby rooster",
+            "The distinctive multi-syllable crow of a rooster filled the morning air",
+            "The sharp crowing of a rooster signaled the break of dawn",
+            "The repetitive crowing of a rooster echoed through the farm",
+            "A high-pitched crowing noise indicated the presence of a rooster nearby"
+        ]
+    },
+    {
+        "event": "crumpling",
+        "phrases": [
+            "crumpling",
+            "crumpling paper",
+            "crumpling occurs continuously",
+            "crumbling paper",
+            "an item crumpling",
+            "crumpling an object",
+            "crinkling",
+            "crumpling some material",
+            "paper crumpling",
+            "crumpling occurs repeatedly",
+            "crumpling with surface contact",
+            "paper is crinkling and crumpling",
+            "the sound of crumpling",
+            "crinkling a rubber object",
+            "plastic is crinkling and crumpling",
+            "paper is crumbling and crinkling",
+            "the crinkling of plastic",
+            "the sound of crumpling paper",
+            "paper is crumpling consistently",
+            "crumpling some packet",
+            "crumpling of an object",
+            "crackling paper",
+            "continuous crumpling",
+            "crumpling of material",
+            "something crumples and crinkles",
+            "wrapper crinkling",
+            "paper is being crumpled and crinkled",
+            "paper is crumpling continuously",
+            "paper is crumpled and crinkled",
+            "A soft, crackling sound came from crumpling paper",
+            "The distinct rustling of material being crumpled filled the air",
+            "A faint crumpling noise marked the handling of a flexible sheet",
+            "The sharp, crisp sound of aluminum foil crumpling was audible",
+            "The rhythmic sound of crumpling paper created a subtle texture in the background"
+        ]
+    },
+    {
+        "event": "crunch",
+        "phrases": [
+            "crunch",
+            "cookie crunch",
+            "a crunch",
+            "crunching",
+            "crisp crunches",
+            "crumpling sound",
+            "A loud, crisp crunch came from the brittle material breaking",
+            "The sharp crunch of footsteps on gravel was clearly audible",
+            "A faint, repeated crunching sound marked the crushing of a substance",
+            "The distinct crunch of a brittle object breaking was unmistakable",
+            "The rhythmic crunching of leaves underfoot created a natural soundscape"
+        ]
+    },
+    {
+        "event": "crying",
+        "phrases": [
+            "crying",
+            "an adult male sighs while crying",
+            "an adult male sobs while crying",
+            "a woman sobs",
+            "crying (fake crying)",
+            "a woman sobbing",
+            "a man sobbing",
+            "a young woman crying",
+            "occasional sobbing",
+            "a woman sobs loudly",
+            "a person makes sobbing noises",
+            "crying over a television",
+            "an adult female is sobbing",
+            "crying noise",
+            "someone sobs",
+            "crying sounds",
+            "someone is sobbing",
+            "someone is crying in a cartoonish manner",
+            "someone sobbing",
+            "someone sobs intermittently",
+            "a man pretend crying",
+            "a person sobbing",
+            "someone else is sobbing",
+            "a man sobs",
+            "woman crying",
+            "a woman is crying hysterically in pain",
+            "a young boy crying",
+            "someone cries intermittently",
+            "a person is crying and sobbing",
+            "crying noises",
+            "A soft, trembling sobbing sound broke the silence",
+            "The faint, erratic breathing noises of crying were audible",
+            "A rhythmic, muffled crying sound filled the air",
+            "The high-pitched wailing of someone crying echoed in the distance",
+            "The repetitive sobbing noises created an emotional atmosphere"
+        ]
+    },
+    {
+        "event": "dental_drill",
+        "phrases": [
+            "dental drill",
+            "the sound of a dental drill",
+            "a dentist drill",
+            "a dental drill",
+            "a drill is being used at the dentist",
+            "A high-pitched, whirring sound came from the dental drill",
+            "The sharp, continuous whine of the drill filled the room",
+            "A piercing sound accompanied the use of the dental drill",
+            "The faint vibration noise of the drill was audible in the background",
+            "The rhythmic, mechanical sound of the dental drill was unmistakable"
+        ]
+    },
+    {
+        "event": "dial_tone",
+        "phrases": [
+            "dial tone",
+            "electronic dial tone",
+            "dial tones",
+            "electronic touch tone telephone dialing",
+            "telephones dial and ring",
+            "dialing tones",
+            "a telephone dialing tone ringing",
+            "a telephone dial tone",
+            "a telephone dial tone occurs",
+            "people dial",
+            "keypress tone",
+            "dialing",
+            "dialing occurs on a telephone",
+            "a telephone is dialed once",
+            "a telephone dialing tone",
+            "keypress tones",
+            "a phone dialing",
+            "men are dialing on a phone with a mechanical tone",
+            "a man presses buttons creating tones on a telephone",
+            "a wireless phone is turning on and dialing",
+            "A steady, monotone hum indicated the line was ready",
+            "The low, continuous tone of the dial tone filled the receiver",
+            "A faint, consistent hum marked the sound of the dial tone",
+            "The rhythmic, unchanging tone of a dial signal was audible",
+            "The distinct sound of a dial tone confirmed the connection"
+        ]
+    },
+    {
+        "event": "ding",
+        "phrases": [
+            "ding",
+            "a ding",
+            "a ping",
+            "dinging",
+            "a ting occurs",
+            "a metal ding",
+            "a tinging occurs",
+            "ding repetitions",
+            "a bell ding",
+            "a bell donging",
+            "ding sounds from a video game",
+            "an electronic ding-dong",
+            "a ting",
+            "dings",
+            "a final ding",
+            "a ding repeats",
+            "a dinging sound",
+            "a metal ting",
+            "a single ding",
+            "a metal bell ding",
+            "a bell tings",
+            "ding-dongs",
+            "a ding-dong sound",
+            "A quick, high-pitched ding sound rang out clearly",
+            "The soft, metallic ding of a small object being struck was audible",
+            "A sharp ding noise echoed briefly in the room",
+            "The faint, rhythmic ding of a bell added a delicate touch",
+            "A single, crisp ding marked an event in the background"
+        ]
+    },
+    {
+        "event": "ding-dong",
+        "phrases": [
+            "a ding-dong",
+            "ding-dongs",
+            "ding-dong sounds",
+            "a ding-dong sound",
+            "a ding-dong sound effect plays",
+            "a ding-dong sound in the background",
+            "an electronic ding-dong",
+            "ding-dong sound",
+            "a ding-dong in a small room",
+            "a doorbell ding-dongs",
+            "A melodic ding-dong chime rang out",
+            "The rhythmic two-tone ding-dong sound filled the air",
+            "A clear, cheerful ding-dong announced someone's presence",
+            "The soft echo of a ding-dong chime lingered briefly",
+            "The distinct ding-dong of a doorbell was unmistakable"
+        ]
+    },
+    {
+        "event": "dog",
+        "phrases": [
+            "dog",
+            "the dog sound",
+            "dogs",
+            "a dog making a sound",
+            "dog contact sounds",
+            "dog sounds",
+            "a dog",
+            "pet dog barking",
+            "dog sound effect",
+            "the sound of a dog",
+            "dog barking",
+            "a dog barks in response",
+            "dog yips",
+            "a dog is reacting to something violently",
+            "real dog bark",
+            "dog surface contact",
+            "dogs bark several times",
+            "bark",
+            "a dog barking in response",
+            "dog barks twice",
+            "several dogs make bow-wow",
+            "dog yipping",
+            "a dog is trying to bark in its sleep",
+            "dogs fight",
+            "A low, rumbling growl came from the dog",
+            "The sharp bark of a dog echoed nearby",
+            "A soft whimper was heard as the dog communicated its emotions",
+            "The rhythmic panting of a dog created a steady background noise",
+            "The excited yapping of a dog filled the air"
+        ]
+    },
+    {
+        "event": "door",
+        "phrases": [
+            "door",
+            "door closing",
+            "a door",
+            "door opening",
+            "doors",
+            "a closing door",
+            "door opens",
+            "door bang",
+            "door shutting",
+            "door slamming",
+            "closing an old wooden door",
+            "a door closing",
+            "the closing of a door",
+            "closes a door",
+            "door open and close",
+            "the sound of a door",
+            "door clanking",
+            "door slams",
+            "a door to a block of flats is closing",
+            "man closing door",
+            "a door opening and then closing",
+            "a person opening and closing a door",
+            "a door opens and closes shut",
+            "an opening of a door",
+            "someone is closing a door in a bathroom",
+            "a door shut",
+            "the opening and closing of a door",
+            "A creaking sound marked the slow opening of a door",
+            "The sharp slam of a door closing echoed briefly",
+            "A faint squeak accompanied the movement of the door hinges",
+            "The rhythmic knocking on a door was clearly audible",
+            "The soft thud of a door shutting filled the room"
+        ]
+    },
+    {
+        "event": "doorbell",
+        "phrases": [
+            "doorbell",
+            "doorbell rings",
+            "doorbell sounds",
+            "doorbell ringing",
+            "apartment doorbell",
+            "a doorbell",
+            "doorbell chimes",
+            "a doorbell ding-dongs",
+            "a doorbell with an electric chime is ringing",
+            "a doorbell rings with ding-dong sounds",
+            "a doorbell chime",
+            "a door bell",
+            "doorbell noises",
+            "doorbells",
+            "a ringing doorbell",
+            "a bell on a shop door",
+            "mechanical door bell",
+            "a doorbell rings",
+            "A sharp, melodic chime of the doorbell rang out",
+            "The rhythmic ding-dong of a doorbell filled the air",
+            "A quick, clear doorbell tone announced a visitor",
+            "The faint ringing of a doorbell was heard in the background",
+            "A cheerful ding-dong sound echoed briefly as the doorbell was pressed"
+        ]
+    },
+    {
+        "event": "drill",
+        "phrases": [
+            "drill",
+            "hole is being drilled",
+            "loud drilling",
+            "drilling",
+            "drilling noises repeat several times",
+            "the sounds of a drill",
+            "a tool loudly drills into something",
+            "a power tool drilling again",
+            "a loud drill",
+            "continuous drilling loudly",
+            "a drill is drilling and subsequently ceases operation",
+            "the drilling of an object",
+            "a power tool sharply drilling",
+            "a power tool is drilling",
+            "the sounds of drilling",
+            "a power tool continues to make drilling noises",
+            "a drill spins loudly nearby and then stops",
+            "drill running",
+            "drill runs",
+            "drill running and shutting down",
+            "a drill drills repeatedly",
+            "a drilling and whirring sound",
+            "a loud drill into something and then turns off",
+            "a drill runs repeatedly",
+            "the sound of a drill being used repeatedly",
+            "drill working multiple times",
+            "drill getting stuck and stopping",
+            "someone is drilling a sheet of plywood",
+            "continuous drilling",
+            "power tools drilling",
+            "A high-pitched whirring sound came from the drill",
+            "The sharp grinding noise of the drill filled the room",
+            "A repetitive buzzing sound marked the drill's operation",
+            "The faint vibration noise of a drill was clearly audible",
+            "The rhythmic hum of the drill created a steady background noise"
+        ]
+    },
+    {
+        "event": "ducks",
+        "phrases": [
+            "ducks",
+            "ducks quack",
+            "ducks quacking",
+            "ducks call",
+            "ducks quack up close",
+            "ducks quacking continuously",
+            "ducks quack intermittently",
+            "ducks quack continuously",
+            "ducks quacking loudly",
+            "quacking ducks",
+            "ducks quaking",
+            "several ducks quack one after another",
+            "ducks quack repetitively",
+            "ducks quack loudly nearby",
+            "ducks quack loudly",
+            "ducks quack several times nearby",
+            "ducks quack multiple times in the distance",
+            "ducks squawk repeatedly",
+            "ducks are making waterfowl sounds",
+            "real ducks quack",
+            "multiple ducks quack repeatedly",
+            "a series of ducks quacking",
+            "ducks quaking continuously",
+            "a group of ducks are quacking",
+            "multiple ducks quack continuously",
+            "ducks quacking irregularly",
+            "several ducks are quacking intermittently",
+            "a number of ducks quacking at once",
+            "ducks respond to calls",
+            "a number of ducks quack",
+            "A loud quacking sound came from a group of ducks",
+            "The rhythmic quack of a duck filled the air",
+            "A soft, repetitive quacking noise was heard nearby",
+            "The distinct quack of ducks created a lively soundscape",
+            "The faint calls of ducks echoed in the background"
+        ]
+    },
+    {
+        "event": "echo",
+        "phrases": [
+            "echo",
+            "distant voices echo",
+            "echoes",
+            "echoing",
+            "A delayed, faint echo repeated the original sound",
+            "The sound of footsteps echoed off the walls",
+            "A clear, resonant echo filled the spacious area",
+            "The rhythmic echo of a voice lingered briefly",
+            "The distant echo of a clap was audible across the canyon"
+        ]
+    },
+    {
+        "event": "electric_shaver",
+        "phrases": [
+            "shaver",
+            "electric shaver",
+            "electric shaver sounds",
+            "electric razor sounds",
+            "electric shaver sound",
+            "an electric toothbrush buzzes",
+            "an electric shaver is switched off",
+            "an electric razor buzzes",
+            "the sound of a shave machine",
+            "sound of an electric shaver",
+            "a long buzz shaves something of wood",
+            "an electric shaver running",
+            "an electric shaver buzzes",
+            "an electric shaver is turning on and off",
+            "electric hair clipper being used",
+            "A steady buzzing sound came from the electric shaver",
+            "The sharp hum of the shaver filled the room",
+            "A faint vibration noise accompanied the shaver's operation",
+            "The rhythmic buzzing of the shaver created a consistent background tone",
+            "The distinct sound of the electric shaver was unmistakable"
+        ]
+    },
+    {
+        "event": "emergency_vehicle",
+        "phrases": [
+            "emergency vehicle",
+            "an ambulance blares its siren",
+            "emergency vehicle siren",
+            "ambulance",
+            "an emergency vehicle turns on the siren",
+            "an emergency vehicle siren changes to a higher pitched siren",
+            "an ambulance moving with its siren on",
+            "ambulance siren",
+            "an emergency vehicle approaches with its siren blaring",
+            "ambulance siren wail",
+            "an ambulence using its siren",
+            "emergency vehicle siren passing by",
+            "police emergency vehicle siren",
+            "a vehicle with sirens blaring approaches",
+            "a police car siren wails quickly",
+            "emergency vehicle siren blasts",
+            "an emergency vehicle passes by quickly",
+            "emergency vehicle's siren blares",
+            "emergency vehicle siren blaring steadily",
+            "a two-tone emergency vehicle siren blows as it approaches, passes by, and then fades into the distance",
+            "an emergency vehicle's siren wails",
+            "emergency vehicle sirens wail multiple times",
+            "a siren ringing of a passing emergency vehicle",
+            "an emergency siren passes by",
+            "police car with sirens blaring passing by",
+            "an ambulance is blaring its siren",
+            "fast urgent loud emergency siren",
+            "A loud, wailing siren signaled the approach of an emergency vehicle",
+            "The sharp, oscillating siren of an emergency vehicle filled the air",
+            "A high-pitched siren noise indicated the vehicle's urgency",
+            "The repetitive blaring of a siren announced the presence of an emergency vehicle",
+            "The distinct Doppler-shifted siren sound moved past quickly"
+        ]
+    },
+    {
+        "event": "engine",
+        "phrases": [
+            "engine",
+            "engine run",
+            "running engine",
+            "another engine",
+            "engine running",
+            "car engine",
+            "engine motor running",
+            "engine operate",
+            "an engine",
+            "motor engine",
+            "boat engine",
+            "an engine run",
+            "auto engine",
+            "light engine",
+            "engine sound effect",
+            "a engine runs",
+            "engine running once again",
+            "the engine runs",
+            "a second engine",
+            "engine in idle",
+            "its engine runs",
+            "engine sound",
+            "its engine running",
+            "motorboat engine",
+            "A deep, steady hum of an engine filled the space",
+            "The rhythmic thrum of the engine indicated it was in operation",
+            "A sharp rev of the engine broke the silence",
+            "The faint vibration noise of an engine was audible in the background",
+            "The low growl of the engine created a mechanical ambiance"
+        ]
+    },
+    {
+        "event": "engine_knocking",
+        "phrases": [
+            "engine knocks",
+            "a motor knocks",
+            "a car engine knocking",
+            "engine knocking sounds",
+            "a car makes an engine knocking sound",
+            "knocking engine",
+            "an engine runs and knocks",
+            "a motor runs and knocks",
+            "knocking engines",
+            "engine knocking noises repeatedly",
+            "engine knocking",
+            "a vehicle engine runs knocking",
+            "a medium engine making knocking sounds",
+            "car making engine knocking sounds",
+            "a car's engine is knocking",
+            "clicking from a engine",
+            "an engine knocks while running",
+            "an engine that is knocking starting",
+            "an engine making a knocking noise",
+            "a medium engine makes knocking noises",
+            "engine sounds with tapping and thumping",
+            "a vehicle engine idles and knocks briefly",
+            "the engine is knocking",
+            "clicking of an engine",
+            "a motorcycle engine knocks",
+            "a car engine knocks",
+            "A sharp, metallic knocking sound came from the engine",
+            "The rhythmic pinging noise indicated a malfunction in the engine",
+            "A faint, repetitive knocking sound was audible as the engine ran",
+            "The distinct metallic ping of engine knocking was unmistakable",
+            "A loud, irregular knocking noise signaled engine trouble"
+        ]
+    },
+    {
+        "event": "engine_starting",
+        "phrases": [
+            "its engine starts",
+            "engine starts",
+            "a engine starting",
+            "car starts",
+            "a vehicle starter turns over",
+            "a motorboat starts up",
+            "engine starting",
+            "engine attempting to start",
+            "motor startup",
+            "a motor engine starting",
+            "an engine starts a second time",
+            "engine being started again",
+            "a engine starts up",
+            "an engine starts",
+            "an engine takes a few seconds to start",
+            "a motor starts",
+            "engines start",
+            "motor engine starts",
+            "a man starts an engine",
+            "an engine start",
+            "A loud, cranking noise marked the engine starting",
+            "The sharp whir of the starter motor was followed by the engine's hum",
+            "A faint clicking sound preceded the engine's ignition",
+            "The rhythmic revving noise indicated the engine was coming to life",
+            "A brief sputtering sound was followed by the steady hum of the engine"
+        ]
+    },
+    {
+        "event": "explosion",
+        "phrases": [
+            "explosion",
+            "an explosion sound",
+            "an energy-type explosion sound",
+            "explosion sounds",
+            "large explosion",
+            "a quick loud explosion",
+            "an explosion occurs",
+            "huge explosion",
+            "a explosion sound",
+            "an explosion",
+            "sudden explosion",
+            "another explosion explodes",
+            "a blast",
+            "an explosion sound effect",
+            "large deep explosion",
+            "a synthetic explosion sound",
+            "a disappointing explosion",
+            "an explosion noise",
+            "an explosive sound",
+            "an explosion happens",
+            "a sound effect explosion",
+            "A loud, booming explosion echoed across the area",
+            "The sharp crack of an explosion was followed by a rumbling noise",
+            "A sudden, deafening blast filled the air",
+            "The distinct sound of a powerful explosion was unmistakable",
+            "A deep, resonant boom signaled the detonation of explosives"
+        ]
+    },
+    {
+        "event": "fart",
+        "phrases": [
+            "fart",
+            "fart over and over",
+            "a fart",
+            "farts",
+            "farting",
+            "fart sound effects",
+            "a fart sound effect",
+            "someone farts",
+            "a fart loop",
+            "a long fart",
+            "the sound of a fart",
+            "a sound of a fart echoing in a sound booth",
+            "a fart sound effect plays",
+            "fart sounds",
+            "a recorded fart",
+            "a funny fart",
+            "men fart",
+            "human farting",
+            "a fart escapes",
+            "someone is composing their own farts in a loop",
+            "a series of farting",
+            "someone is making a fart sound effect",
+            "fart noises",
+            "A brief, low-pitched fart noise broke the silence",
+            "The sharp, quick sound of flatulence was clearly audible",
+            "A faint, muffled fart sound lingered briefly",
+            "The repetitive, comical sound of flatulence created a humorous tone",
+            "A loud, resonant fart noise filled the room momentarily"
+        ]
+    },
+    {
+        "event": "female_singing",
+        "phrases": [
+            "a female singer performs several songs",
+            "a female singer performs multiple times",
+            "a female singer is performing",
+            "a female sing",
+            "a female singer performs",
+            "a female sings",
+            "a female vocalist",
+            "female singer performs",
+            "a female singer sings",
+            "a female voice sings multiple times",
+            "a female is singing with lots of reverb",
+            "a female singer sing",
+            "a female voice sings",
+            "a female singer",
+            "two women sing",
+            "a female singer speaking",
+            "female voices sing a song",
+            "a female singing with mechanisms",
+            "male and female singers perform",
+            "female singers perform",
+            "a female singer singing",
+            "singing with female vocals",
+            "a female voice sings continuously",
+            "a female voice is singing a song",
+            "female sings",
+            "a female singer performing",
+            "a female sings along",
+            "a female singer perform",
+            "female vocals sing",
+            "A soft, melodic voice of a woman sang a gentle tune",
+            "The clear, high-pitched singing of a woman filled the air",
+            "A rich, vibrant tone marked the woman's singing voice",
+            "The rhythmic, soothing melody of female singing created a peaceful ambiance",
+            "The faint, harmonious hum of a woman singing was audible in the distance"
+        ]
+    },
+    {
+        "event": "female_speech",
+        "phrases": [
+            "female speech",
+            "female speeches",
+            "female english speech",
+            "a spoken female voice",
+            "woman's speech",
+            "female speaking",
+            "a female speaking",
+            "a single female voice speaking",
+            "a female speaks",
+            "female making speech",
+            "brief female speech",
+            "continuous female speech",
+            "a woman speeches",
+            "female speaks",
+            "a woman speeching",
+            "a womans dialogue",
+            "women's speech",
+            "female voice",
+            "a woman is giving a confident speech",
+            "a female voice sporadically speaks",
+            "a female speaker speaking",
+            "female voice speaking",
+            "a female voice is narrating",
+            "a female voice speaks",
+            "a female voice speaking",
+            "repeated female speech",
+            "a woman having a narration",
+            "additional female speech",
+            "a females voice speaks out",
+            "a young woman speeches",
+            "A calm, steady voice of a woman was heard nearby",
+            "The rhythmic articulation of a woman's speech filled the space",
+            "The clear and deliberate tone of a female voice conveyed confidence",
+            "A soft-spoken female voice was audible in the background",
+            "The cheerful, lively speech of a woman added energy to the environment"
+        ]
+    },
+    {
+        "event": "filing_(rasp)",
+        "phrases": [
+            "dry scraping",
+            "filing (rasping) sounds",
+            "metal filing and scrapping a surface",
+            "filing (rasp) sounds",
+            "a filing sound",
+            "a man is filing something",
+            "wood being filed",
+            "a file rasps against a surface several times nearby",
+            "a man uses a filing rasp",
+            "rasping of file",
+            "rasping",
+            "a rasping filing continues",
+            "metal filing",
+            "a man is filing a piece of wood",
+            "scrapping and filing of wood",
+            "a rasping sound of filing",
+            "a rasping and filing sound",
+            "a file rubbing against a surface loudly",
+            "sharp filing sounds",
+            "A sharp, rasping sound came from the file against the metal",
+            "The rhythmic scraping noise of filing filled the workshop",
+            "A coarse, grating sound marked the action of a file on wood",
+            "The faint, repetitive rasping sound accompanied the filing motion",
+            "The distinct, metallic grinding noise of filing was unmistakable"
+        ]
+    },
+    {
+        "event": "fire",
+        "phrases": [
+            "fire",
+            "fires",
+            "fire effects",
+            "fire cracking sound",
+            "a fire burning inside",
+            "fire burning sound",
+            "A soft crackling sound came from the burning fire",
+            "The rhythmic popping of flames filled the air",
+            "A low, steady roar indicated the presence of a large fire",
+            "The faint hiss of burning material was clearly audible",
+            "The sharp, crackling noise of fire created a lively soundscape"
+        ]
+    },
+    {
+        "event": "fire_alarm",
+        "phrases": [
+            "fire alarm interior field recording",
+            "fire alarm",
+            "a fire alarm is tested and switched off",
+            "a fire alarm beeps continuously",
+            "a fire alarm sounds multiple times",
+            "fire alarm is sounding in a hospital",
+            "a fire alarm repeatedly",
+            "fire alarm with mechanisms",
+            "A loud, repetitive alarm tone blared across the room",
+            "The sharp, high-pitched wailing of the fire alarm filled the building",
+            "A rhythmic beeping noise signaled an emergency",
+            "The continuous blare of the fire alarm demanded attention",
+            "The distinct, piercing sound of a fire alarm was unmistakable"
+        ]
+    },
+    {
+        "event": "fire_engine",
+        "phrases": [
+            "fire engine",
+            "fire engine siren-blaring",
+            "fire trucks",
+            "fire trucks sound off",
+            "fire brigade signal",
+            "fire trucks sound",
+            "firetruck getting closer",
+            "fire truck pulls out",
+            "a fire truck driving",
+            "fire engines",
+            "fire engine approaching",
+            "fire truck",
+            "a fire engine is passing by",
+            "fire truck sirens outside",
+            "a fire truck is passing by",
+            "fire engine drives",
+            "the fire truck drives by",
+            "a fire engine starts and honks horn",
+            "a fire engine moving",
+            "a fire truck is driving by",
+            "crackling fire engine sirens",
+            "a fire truck is approaching",
+            "fire engines sound",
+            "a fire engine is making brakes",
+            "a fire engine is accelerating and making engine sounds",
+            "fire engine horns",
+            "a loud fire engine",
+            "a fire truck runs",
+            "a fire truck drives with sirens",
+            "A loud, wailing siren signaled the approach of a fire engine",
+            "The sharp, oscillating siren of a fire engine filled the air",
+            "A high-pitched siren noise indicated the urgency of the situation",
+            "The repetitive blaring of a siren announced the presence of a fire engine",
+            "The deep, rumbling engine noise of the fire truck accompanied its siren"
+        ]
+    },
+    {
+        "event": "fireworks",
+        "phrases": [
+            "fireworks",
+            "firework booms",
+            "fireworks sounds",
+            "fireworks go off",
+            "fireworks are echoing across a valley",
+            "fireworks noises",
+            "firework sounds",
+            "firework celebration",
+            "fireworks explode",
+            "firecrackers",
+            "fireworks hiss",
+            "occasional fireworks",
+            "fireworks are being recorded inside a house near an open window",
+            "fireworks explode crackling",
+            "fireworks burst",
+            "fireworks pop",
+            "fireworks near and far",
+            "fireworks going off",
+            "a series of several fireworks exploding one after another",
+            "the sounds of fireworks",
+            "fireworks burst loudly",
+            "fireworks explode and echo",
+            "multiple fireworks pop and crackle",
+            "a fireworks display takes place",
+            "fireworks crackle",
+            "fireworks fizzle",
+            "fireworks are going off outside an apartment",
+            "firecrackers go off",
+            "firecrackers burst",
+            "the sound of fireworks",
+            "A loud, cracking noise marked the explosion of fireworks",
+            "The sharp whistling sound of a firework ascending filled the air",
+            "A series of rapid pops and bangs created an energetic soundscape",
+            "The deep boom of a large firework echoed across the area",
+            "The faint crackling of fireworks added a festive ambiance"
+        ]
+    },
+    {
+        "event": "fixed-wing_aircraft",
+        "phrases": [
+            "fixed-wing aircraft",
+            "an aircraft's engine",
+            "airplane",
+            "aircraft",
+            "aircraft approach",
+            "the sound of a fixed-wing aircraft",
+            "the sounds of an aircraft",
+            "the sound of fixed-wing aircraft",
+            "aircraft softly accelerating",
+            "the sound of a fixed-wing airplane",
+            "airplane sounds",
+            "an aircraft engine gets louder as it approaches",
+            "an airplane engine runs consistently",
+            "an aircraft engine swirls in the background",
+            "a muted jet",
+            "plane taking off or landing",
+            "an aircraft intermittently",
+            "aircraft fly",
+            "steady jet engine running",
+            "engine of airplane taking off",
+            "an aircraft's propeller",
+            "large aircraft taking off",
+            "airplane ambience internal ground",
+            "fixed-wing aircraft fly overhead",
+            "humming of a nearby jet engine",
+            "A deep, steady hum of the aircraft filled the sky",
+            "The rhythmic drone of the engine signaled the aircraft's flight",
+            "A faint roar of the aircraft was audible in the distance",
+            "The oscillating whine of the aircraft engine created a mechanical soundscape",
+            "The continuous sound of propellers cutting through the air was distinct"
+        ]
+    },
+    {
+        "event": "fly",
+        "phrases": [
+            "a housefly buzzes briefly",
+            "a housefly buzzes around",
+            "a housefly buzzing sound",
+            "housefly",
+            "a housefly is buzzing",
+            "a housefly",
+            "housefly noise",
+            "housefly sound",
+            "fly sound",
+            "a fly buzzes around",
+            "a fly buzzes by",
+            "a fly buzzing",
+            "A faint buzzing sound marked the presence of a fly",
+            "The sharp, high-pitched buzz of a fly was clearly audible",
+            "The rhythmic droning of a fly added a subtle background noise",
+            "The soft fluttering of fly wings created a delicate vibration in the air",
+            "The persistent buzzing noise of a fly moved erratically around"
+        ]
+    },
+    {
+        "event": "frog",
+        "phrases": [
+            "frog",
+            "a frog croaks several times in a row",
+            "frog is making pulses",
+            "frog croaks",
+            "the sounds of a frog",
+            "a frog croaking repeatedly without stopping",
+            "a frog consistently croaks",
+            "a frog croaks seven times in the foreground",
+            "frog sounds",
+            "a frog croaking sharply several times",
+            "frog croaks twice",
+            "a frog croaking at regular intervals",
+            "a frog croaks multiple times",
+            "frog calls",
+            "frog making croaking sound",
+            "a frog chirps monotonously",
+            "a frog croaks several times",
+            "frog croaks twice again",
+            "the sound of a frog",
+            "a frog is chirping over and over",
+            "a frog is croaking multiple times",
+            "multiple frogs croak together",
+            "a frog repeatedly croaks",
+            "a frog croaks intensively",
+            "a frog continuously croaks at a fast pace",
+            "multiple frogs croak repeatedly",
+            "a frog croaks continuously",
+            "a frog noise from a toy",
+            "a frog",
+            "several frogs croak in rapid succession",
+            "A deep, guttural croak came from the frog",
+            "The rhythmic croaking of frogs filled the air",
+            "A sharp, repetitive frog call echoed in the distance",
+            "The faint chirping of tree frogs added a natural ambiance",
+            "The distinct croak of a bullfrog was clearly audible"
+        ]
+    },
+    {
+        "event": "frying_(food)",
+        "phrases": [
+            "frying food",
+            "food frying",
+            "frying of food",
+            "someone is cooking food in a deep-fat fryer",
+            "oil frying",
+            "steak frying",
+            "someone is frying sausage on a cast iron",
+            "frying",
+            "sausage is being fried on cast iron",
+            "foods being fried",
+            "something frying and crackling and sizzling the whole time",
+            "grease is frying in a skillet",
+            "frying food in a wok",
+            "food cooking",
+            "food being fried",
+            "someone is frying sausages on a stove",
+            "the sound of cooking food in oil or another fat",
+            "frying foods",
+            "a pan of food is frying on the fire",
+            "loud frying of food",
+            "a loud sizzling of food frying",
+            "steak is being fried in oil",
+            "a food item is frying and sizzling",
+            "louder and more vigorous frying sound",
+            "food begins sizzles while frying",
+            "cooking food sizzling",
+            "mixing of sizzling food",
+            "mushrooms are being fried at low temperature",
+            "the sizzling of frying food",
+            "food is frying with sizzling noises",
+            "A loud, sizzling sound came from the hot oil in the pan",
+            "The rhythmic crackling noise of frying food filled the kitchen",
+            "A sharp hissing sound accompanied the food as it cooked",
+            "The faint bubbling noise of frying oil added to the cooking ambiance",
+            "The persistent sizzling of frying food created an energetic soundscape"
+        ]
+    },
+    {
+        "event": "giggle",
+        "phrases": [
+            "giggle",
+            "a young boy making a fake giggle",
+            "a female giggle",
+            "a giggle",
+            "a young female giggles in the foreground",
+            "a young woman's crazy cute giggle",
+            "a female giggles",
+            "an adult female gags",
+            "a small girlish giggle",
+            "cartoon characters are giggling",
+            "a kid laughing comically",
+            "females giggle",
+            "giggling",
+            "a girl laughs hysterically",
+            "a cartoonish voice laughing",
+            "an adult female pretending to sneeze",
+            "cartoon characters laugh",
+            "a short giggle",
+            "giggles",
+            "an adult female giggles",
+            "a female and male giggle",
+            "a clown laugh",
+            "a silly laugh",
+            "a hysterical laugh of a small child",
+            "an adult female laugh",
+            "laughter of a woman",
+            "giggle sounds",
+            "an adult female laughs in the foreground",
+            "a child laughing in response",
+            "A soft, high-pitched giggle broke the silence",
+            "The faint, repetitive giggling of someone was audible nearby",
+            "A cheerful, melodic giggle filled the room with joy",
+            "The rhythmic giggling of a nervous person created a playful tone",
+            "The distinct sound of a giggle was unmistakable"
+        ]
+    },
+    {
+        "event": "glass_shatter",
+        "phrases": [
+            "glasses shatter",
+            "glasses breaking",
+            "shatter",
+            "a glass breaks",
+            "a glass plate is shattering on the floor",
+            "glass shatter",
+            "something heavy shatters a glassy material with deep reverb",
+            "glass breaks",
+            "glass to shatter",
+            "glass breaking sound",
+            "a glass shatter sound",
+            "glass shatters",
+            "glass shattering",
+            "a glass shattering sound",
+            "glass hitting pavement and shattering",
+            "glass breaking",
+            "a glass shatter",
+            "a glass shattering",
+            "glass shatters twice",
+            "glass broken on the floor",
+            "a series of glass shattering",
+            "a glass breaking",
+            "a glass jar is breaking on a tile floor",
+            "glass smashes",
+            "a shattering glass sound",
+            "people are shattering glass",
+            "glass shatter in the background",
+            "glass crashing",
+            "glass shatters loudly",
+            "glass shatters and breaks",
+            "A sharp, high-pitched sound marked the shattering of glass",
+            "The loud, cracking noise of glass breaking filled the air",
+            "A faint tinkling sound followed the initial shatter",
+            "The distinct sound of glass shattering was unmistakable",
+            "The echo of breaking glass lingered briefly in the room"
+        ]
+    },
+    {
+        "event": "goat",
+        "phrases": [
+            "goat",
+            "goats",
+            "goat bleeting",
+            "goats sounding",
+            "goat clears throat",
+            "goats bey",
+            "goat noise",
+            "goats baa",
+            "livestock bleat",
+            "goats footsteps",
+            "goats rustle",
+            "goat cries",
+            "goat bleat",
+            "goats baaing",
+            "goat sounds",
+            "goat bleats",
+            "goat noises",
+            "goat bleating",
+            "goats walk around",
+            "A soft, repetitive bleating sound came from the goat",
+            "The loud, rhythmic bleat of a goat filled the air",
+            "A faint, high-pitched bleating noise was audible nearby",
+            "The distinct call of a goat created a rural ambiance",
+            "The soft, low-pitched bleating of a goat added to the natural soundscape"
+        ]
+    },
+    {
+        "event": "groan",
+        "phrases": [
+            "groan",
+            "groans",
+            "a man deeply groans",
+            "moan",
+            "terrible monster groaning in pain",
+            "a strange creature from the abyss is making a weird groaning growly sound",
+            "a monster is groaning",
+            "a deep groan",
+            "customized hippo groan",
+            "a groan",
+            "animals groaning",
+            "a zombie is groaning",
+            "a single zombie groan",
+            "a groan repeated multiple times",
+            "a low groan is repeated several times",
+            "men groan",
+            "a male groans",
+            "groaning",
+            "a deep sub groan",
+            "A low, guttural groan of pain broke the silence",
+            "The faint, muffled groaning of someone was audible nearby",
+            "A sharp, high-pitched groan indicated disapproval",
+            "The rhythmic groaning of effort filled the room",
+            "The distinct sound of a groan conveyed discomfort"
+        ]
+    },
+    {
+        "event": "growling",
+        "phrases": [
+            "growling",
+            "growls",
+            "growling animals",
+            "growling dog",
+            "growling sounds",
+            "animal growling",
+            "a growling sound effect",
+            "growling intermittently",
+            "a growling animal",
+            "growl effect",
+            "growling noises",
+            "a deep growling voice",
+            "a softer version of a growl loop",
+            "a rhythmic roar",
+            "growling sounds with voice effects",
+            "a roar",
+            "a growling creature",
+            "low growling",
+            "The deep growling sound echoed in the distance",
+            "There was a guttural growl, warning of potential danger",
+            "The growling noise carried a sense of threat and aggression",
+            "Low growling sounds could be heard, signaling anger",
+            "The growl reverberated, creating an ominous atmosphere"
+        ]
+    },
+    {
+        "event": "grunt",
+        "phrases": [
+            "grunt",
+            "a grunt",
+            "an adult male grunting with exertion",
+            "painful male grunt sounds",
+            "a man grunts in a video game",
+            "painful male hurt sounds",
+            "a grunt from a man",
+            "a deep grunt",
+            "a man deeply grunts",
+            "A short grunt broke the silence",
+            "Grunting sounds were heard in the background",
+            "The grunt was low and rough, almost animalistic",
+            "Short, abrupt grunts punctuated the air",
+            "A deep grunt hinted at exertion or irritation"
+        ]
+    },
+    {
+        "event": "gunshot",
+        "phrases": [
+            "gunshot",
+            "gun shot",
+            "rifle shot",
+            "gunfire",
+            "shots",
+            "gun shoot",
+            "gunfires",
+            "shotsgun shots",
+            "shots sounds with a reverb effect",
+            "gun fire",
+            "gunfire with echoes",
+            "gun shots",
+            "gun fires",
+            "gunshot sounds",
+            "gunfire shots",
+            "gunshots",
+            "gunshot sound effects",
+            "shooting",
+            "gunshot pops",
+            "rifle shooting",
+            "a sniper rifle firing shot sound effect",
+            "a shot gun is blasting loudly with some reverb",
+            "loud arbalette shot",
+            "a gun shot",
+            "a shotgun is being fired on a training range",
+            "gun shots with echoes",
+            "gunfire echoes",
+            "gunshots echo",
+            "shots sounds from a pheasant shoot",
+            "gun shot noise",
+            "A loud gunshot pierced the air",
+            "Gunshots echoed sharply in the distance",
+            "The crack of a gunshot was sudden and startling",
+            "Multiple gunshots rang out in quick succession",
+            "The sharp report of a gunshot reverberated briefly"
+        ]
+    },
+    {
+        "event": "gurgling",
+        "phrases": [
+            "gurgling",
+            "gurgle",
+            "gargling",
+            "gargle",
+            "gurgling liquid",
+            "low gurgling",
+            "gurgles",
+            "low pitched gurgling",
+            "gurgling underwater",
+            "water gurgling",
+            "gurgling water",
+            "gargling with water",
+            "water gurgling repeatedly",
+            "gargling sounds",
+            "liquid gurgling",
+            "gurgling down a drain",
+            "gurgling water pouring",
+            "loud gurgling of water",
+            "water gurgling continuously",
+            "gurgling of water",
+            "water continuously gurgling",
+            "water gurgling vigorously",
+            "A gurgling sound of water was heard flowing steadily",
+            "The bubbling gurgle grew louder as the liquid poured",
+            "Gurgling noises came from the narrow stream nearby",
+            "The gurgling sound resembled water passing through a constriction",
+            "The rhythmic gurgling of the liquid was soothing"
+        ]
+    },
+    {
+        "event": "hammer",
+        "phrases": [
+            "hammer",
+            "mechanisms hammer",
+            "a hammer hammers",
+            "hammering",
+            "mono jackhammer",
+            "a jackhammer pounds",
+            "machines hammer",
+            "hammer being used",
+            "a hammer strikes",
+            "a hammer pounds a hard surface",
+            "a hammer",
+            "a person hammers a solid object",
+            "striking with a hammer",
+            "a hammer hits",
+            "hammer sounds",
+            "tools hammering",
+            "hammering several times",
+            "a hammer pounding",
+            "a hammer pounds repeatedly",
+            "a series of loud metal hammering",
+            "hammering with surface contact",
+            "a man uses a hammer",
+            "hammer sounds",
+            "a hammering",
+            "a person hammers on a surface",
+            "hammer tap",
+            "someone is using a hammer on a construction site",
+            "hammering an object",
+            "The rhythmic hammering sound echoed through the workshop",
+            "A series of sharp hammer blows could be heard",
+            "The hammering noise was consistent and deliberate",
+            "The metallic clang of the hammer striking was distinct",
+            "Hammering sounds punctuated the otherwise quiet environment"
+        ]
+    },
+    {
+        "event": "helicopter",
+        "phrases": [
+            "helicopter",
+            "helicopter rotor",
+            "helicopter sound",
+            "helicopter sounds",
+            "helicopter moving away",
+            "helicopter blades",
+            "quadcopter",
+            "quadcopter noise",
+            "helicopter engine",
+            "propeller sound of a helicopter",
+            "propeller sounds",
+            "helicopters start flying",
+            "helicopter flying away",
+            "helicopter rotors",
+            "propeller airscrew sounds",
+            "helicopter taking off",
+            "electronic helicopter sound",
+            "propeller noise",
+            "propellers turn",
+            "a propeller sound",
+            "helicopter flying",
+            "The distinct whirring of helicopter blades filled the air",
+            "A low-pitched chopping sound indicated a helicopter nearby",
+            "The helicopter's rotor noise grew louder as it approached",
+            "The steady beat of the rotor blades was unmistakable",
+            "Helicopter sounds hovered persistently in the background"
+        ]
+    },
+    {
+        "event": "hiccup",
+        "phrases": [
+            "hiccup",
+            "hiccupping",
+            "a hiccup sound",
+            "hiccuping twice",
+            "hiccuping",
+            "baby hiccup",
+            "hiccups",
+            "hiccup sounds",
+            "their hiccup sounds",
+            "an adult pretending to hiccup",
+            "a woman hiccup",
+            "a hiccup occurs",
+            "a woman hiccupping",
+            "hiccup in the background",
+            "women hiccup",
+            "male hiccuping",
+            "a man hiccupping",
+            "a series of hiccups",
+            "baby hiccuping",
+            "a hiccup",
+            "people hiccup",
+            "a person hiccups multiple times",
+            "hiccupping in the background",
+            "man hiccuping",
+            "hiccup sounds in the background",
+            "she hiccups",
+            "hiccupping sounds",
+            "people are hiccupping in the background noise",
+            "people are hiccupping",
+            "the woman hiccups",
+            "A soft hiccup broke the silence momentarily",
+            "The repetitive sound of hiccups echoed lightly",
+            "Each hiccup was followed by a small pause",
+            "The hiccup sound was short and involuntary",
+            "A rhythmic series of hiccups could be heard faintly"
+        ]
+    },
+    {
+        "event": "hiss",
+        "phrases": [
+            "hiss",
+            "hisses",
+            "hissing",
+            "hissing spray",
+            "steam hiss",
+            "compressed air hisses",
+            "compressed air hissing",
+            "a quick powerful hiss",
+            "compressed air releasing",
+            "repetitive hissing",
+            "compressed air",
+            "A long hiss could be heard, sharp and steady",
+            "The hissing sound grew louder and more pronounced",
+            "A sudden hiss broke the stillness of the room",
+            "The hiss was continuous and high-pitched",
+            "A faint hissing noise persisted in the background"
+        ]
+    },
+    {
+        "event": "horse",
+        "phrases": [
+            "horses",
+            "a horse makes a sound",
+            "horse sounds",
+            "a horse's sounds",
+            "a horse slows and stops",
+            "horse walking",
+            "a horse trollops",
+            "horse noise",
+            "a horse speeds down a path",
+            "a series of horse sounds",
+            "horses sound",
+            "horse breaths",
+            "a horse stops",
+            "horse exhaling",
+            "horses walk",
+            "horses tap",
+            "a horse",
+            "horse noises",
+            "horse snorts",
+            "noise from a horse",
+            "horse running",
+            "the sounds of horses outside",
+            "hooves of horses",
+            "The horse's neigh resonated loudly",
+            "A soft snort from a horse broke the calm",
+            "The rhythmic clopping of hooves was unmistakable",
+            "A low whinny came from somewhere nearby",
+            "The horse's breath was audible as it exhaled deeply"
+        ]
+    },
+    {
+        "event": "howl",
+        "phrases": [
+            "howls",
+            "dog howling",
+            "a dog is howling",
+            "animal howling loudly",
+            "howling sound",
+            "many dogs howling at the same time",
+            "an animal howls",
+            "a dog is howling",
+            "dog howling sound",
+            "a loud howl",
+            "wolves howl",
+            "howling noises",
+            "A long, mournful howl echoed through the night",
+            "The howling sound was eerie and distant",
+            "A series of howls broke the otherwise quiet surroundings",
+            "The plaintive howl carried an air of loneliness",
+            "Howling noises seemed to come from all directions"
+        ]
+    },
+    {
+        "event": "idling",
+        "phrases": [
+            "idling",
+            "idling sounds",
+            "idling of engine",
+            "idling car",
+            "idling increases",
+            "idling engine",
+            "idling noise",
+            "idling vehicle motor",
+            "idling at a constant speed",
+            "an engine that is idling",
+            "idling of an engine",
+            "a machine resembling a vehicle makes sounds of an engine idling",
+            "idling car engine",
+            "an engine tapping and idling",
+            "idling vehicle",
+            "an idling machine",
+            "its engine idles",
+            "clicking from an idling engine",
+            "idling engines",
+            "The engine idled with a steady hum",
+            "A low idling sound filled the air",
+            "The idling engine produced a consistent, rhythmic noise",
+            "A faint idling sound could be heard in the background",
+            "The sound of the engine idling was calm and subdued"
+        ]
+    },
+    {
+        "event": "jackhammer",
+        "phrases": [
+            "jackhammer",
+            "jackhammering",
+            "mono jackhammer",
+            "a jackhammer pounds repeatedly",
+            "a jack-hammer is breaking up a concrete slab",
+            "a jackhammer runs",
+            "a jackhammer operating",
+            "a jackhammer in use",
+            "a jackhammer creates a loud and constant pounding noise",
+            "a jackhammer in operation",
+            "a jackhammer runs continously",
+            "a jackhammer operating then slowing down before operating at a normal rate again",
+            "someone is using a jackhammer to break concrete",
+            "the sound of a jackhammer",
+            "a jackhammer pounds",
+            "a jack-hammer is being used in a factory",
+            "workers are using a jackhammer",
+            "a jackhammer drilling and vibrating continuously",
+            "a jackhammer",
+            "a jackhammer operates",
+            "a jackhammer is loudly breaking concrete",
+            "a jackhammer operates with mechanical sounds",
+            "a man uses a jackhammer",
+            "a jackhammer pounds a hard surface",
+            "real sound of a jackhammer",
+            "a jackhammer runs continuously",
+            "machines hammer",
+            "a jackhammer drilling",
+            "The jackhammer's rapid pounding echoed loudly",
+            "A sharp, rhythmic jackhammer sound filled the air",
+            "The jackhammer noise was relentless and intense",
+            "The pounding of the jackhammer was unmistakable",
+            "The sound of the jackhammer carried through the surroundings"
+        ]
+    },
+    {
+        "event": "jet_engine",
+        "phrases": [
+            "a jet engine runs",
+            "aircraft jet engine",
+            "a jet engine roars",
+            "humming of a nearby jet engine",
+            "jet engine sounds",
+            "jet engine of an aircraft running",
+            "a jet engine runs and hisses",
+            "a jet engine hisses",
+            "jet engine",
+            "a jet engine whirs loudly",
+            "jet engine operating",
+            "a jet engine is operating and accelerating",
+            "jet engine flying",
+            "jet engine hums",
+            "steady jet engine running",
+            "a jet engine runs steadily",
+            "whooshing from a jet engine",
+            "a jet engine works nearby",
+            "a jet engine whirring sharply",
+            "a jet engine screams",
+            "The roar of a jet engine filled the atmosphere",
+            "A loud jet engine sound dominated the surroundings",
+            "The jet engine emitted a powerful, continuous noise",
+            "The sound of the jet engine grew louder as it neared",
+            "A deep, rumbling jet engine noise was audible"
+        ]
+    },
+    {
+        "event": "knock",
+        "phrases": [
+            "knock",
+            "knocks",
+            "knocking",
+            "knocking on door",
+            "knocking on a door",
+            "knock sounds",
+            "knocking on an office door",
+            "knocking on a window",
+            "door knocking",
+            "knocks on a wooden door",
+            "knocking in a wooden door",
+            "knocking on wooden door from inside",
+            "knocking on wood",
+            "pounding on a door",
+            "a series of knocks",
+            "knocking repeats numerous times",
+            "knocking on wood like a door",
+            "knocking on a glass surface",
+            "several knocks on a house door",
+            "knocking on a hollow wooden surface",
+            "a knocking on a door",
+            "a series of knocking",
+            "knocking repeatedly",
+            "a knocking door",
+            "a person knocks on a door",
+            "a knock",
+            "a series of door knocking",
+            "a knock on a door",
+            "knocking sounds",
+            "a loud knock on a door",
+            "A short knock echoed through the space",
+            "Knocking sounds came in a steady rhythm",
+            "The sharp knock was deliberate and clear",
+            "A faint knock could be heard from the distance",
+            "The knocking noise was distinct against the silence"
+        ]
+    },
+    {
+        "event": "laughter",
+        "phrases": [
+            "laughter",
+            "laugh",
+            "laughing",
+            "audience laughs",
+            "wicked laughter",
+            "laughter continues",
+            "audience laugh",
+            "laughs",
+            "audience laughing",
+            "short laughter",
+            "laughter from two or more persons",
+            "laughter on a good level",
+            "laughter from the man",
+            "the audience laughs",
+            "audience laughter",
+            "laughters",
+            "the audience laughing",
+            "people belly laugh",
+            "tap dancers laugh",
+            "a laughter",
+            "a laughter sample",
+            "belly laugh",
+            "a lot of belly laughter",
+            "canned laughter",
+            "an audience laughs",
+            "laughter from crowd",
+            "short laughter from an audience",
+            "The sound of laughter rang out joyfully",
+            "A burst of laughter broke the silence",
+            "The rhythmic sound of laughter filled the space",
+            "Laughter echoed, spreading a sense of cheer",
+            "A soft chuckle turned into hearty laughter"
+        ]
+    },
+    {
+        "event": "lawn_mower",
+        "phrases": [
+            "lawn mower",
+            "lawn mowers",
+            "lawnmower engine",
+            "lawn mowers start",
+            "lawn mowers run",
+            "the sounds of a lawn mower",
+            "lawn mowing",
+            "a lawn mower engine running then powering down",
+            "a lawn mower shifts gears and accelerates",
+            "lawn mower operates",
+            "lawn mower runs",
+            "a ride-on-lawnmower is being recorded",
+            "loud noise of turning on and off a lawnmower",
+            "lawn mower with engine starting",
+            "tree chipper sound effect",
+            "a lawnmower starts and stops",
+            "someone is cutting grass with an electric mower",
+            "a lawn mower is running with engine starting",
+            "a lawn mower runs and stops",
+            "a lawn mower operates",
+            "someone is cutting grass with a motor mower",
+            "lawn mower riding past a dog",
+            "a lawnmower putters out",
+            "the loud onset of a lawn mower engine idling",
+            "lawn mower sounds",
+            "lawnmower type engine sound that grows stronger then fades off",
+            "a lawnmower engine is started multiple times",
+            "a lawn mower engine running",
+            "a lawn mower running steadily for some time",
+            "The steady hum of the lawn mower filled the yard",
+            "A loud buzzing noise came from the spinning blades of the mower",
+            "The lawn mower's engine produced a rhythmic droning sound",
+            "The whirring of the mower grew louder as it passed nearby",
+            "A persistent mowing sound resonated throughout the area"
+        ]
+    },
+    {
+        "event": "liquid",
+        "phrases": [
+            "liquid",
+            "liquids pour",
+            "water pour",
+            "pouring liquids",
+            "a liquid",
+            "a liquid pours",
+            "liquid sounds",
+            "water",
+            "liquid is pouring into a pitcher",
+            "the sound of liquids",
+            "the sounds of liquid",
+            "liquids pour and splash",
+            "liquid pours",
+            "pouring liquid into container",
+            "water pouring sounds",
+            "liquids pouring",
+            "the sounds of liquid and water",
+            "spilling water",
+            "pouring liquid",
+            "liquid pouring",
+            "liquids are poured and slosh around",
+            "liquid filling sounds",
+            "someone is pouring a glass of water down a sink",
+            "liquid is trickled and dribbled into water",
+            "pours liquid",
+            "liquid is being poured into another liquid",
+            "water being poured into a pitcher",
+            "someone is pouring water",
+            "The liquid splashed softly as it was poured",
+            "A faint dripping sound signaled the presence of liquid",
+            "The gurgling of liquid could be heard as it flowed steadily",
+            "The bubbling sound of liquid indicated gentle motion",
+            "A sloshing noise came from the container as the liquid shifted"
+        ]
+    },
+    {
+        "event": "machine_gun",
+        "phrases": [
+            "machine guns sound",
+            "machine gun sound",
+            "sound of a machine gun",
+            "machine gun sounds",
+            "machine gunfire sound",
+            "machine gun fires",
+            "machine guns fire",
+            "a short series of machine gunfire",
+            "machine gunshots sound",
+            "machine gun fire occurs",
+            "a machine gun is fired in short bursts",
+            "fires a machine gun",
+            "machine gun shooting",
+            "machine gun fires",
+            "machine guns shot sound",
+            "machine gunfire rings out",
+            "machine gun violence",
+            "loud machine gun sound",
+            "a light machine gun is firing a single burst",
+            "light machine gun sound",
+            "a machine gun is fired",
+            "machine guns are operating",
+            "powerful machine gun is being fired",
+            "machine gun fire rings out",
+            "Rapid machine gun fire echoed sharply",
+            "The staccato burst of machine gun bullets was unmistakable",
+            "A steady rattle of machine gun fire filled the air",
+            "The loud, repetitive sound of a machine gun dominated the scene",
+            "The sharp report of a machine gun firing in rapid succession was startling"
+        ]
+    },
+    {
+        "event": "male_singing",
+        "phrases": [
+            "a male voice sings",
+            "a man sings a mantra",
+            "a man sings a song",
+            "a male singer sings",
+            "a male singer sing",
+            "male voices sing a song",
+            "a male singer performs",
+            "a male voice sings multiple times",
+            "a male sing",
+            "a male voice sings in a repeating sequence",
+            "a male sings",
+            "singing a line",
+            "singing with a male voice",
+            "singing by a male",
+            "an adult male sings",
+            "singing from a male",
+            "a male singing voice heard intermittently",
+            "a man is singing a song",
+            "a male singer",
+            "a male singing voice",
+            "a male singer is performing",
+            "a man sings continuously throughout the track",
+            "a young adult male sings",
+            "male singer performs",
+            "singing with male vocals",
+            "a male sings intermittently",
+            "a male voice chanting",
+            "a man sings multiple songs",
+            "a man take turns singing",
+            "The deep tones of male singing resonated warmly",
+            "A melody sung by a male voice carried through the air",
+            "The rich timbre of the male singing voice was captivating",
+            "The male voice sang with a clear and resonant tone",
+            "A harmonious male vocal performance filled the space"
+        ]
+    },
+    {
+        "event": "mechanical_bell",
+        "phrases": [
+            "a mechanical bell ringer",
+            "a mechanical bell",
+            "a mechanical bell rings",
+            "a mechanical bell is ringing",
+            "mechanical bells",
+            "mechanical bell sound",
+            "The mechanical bell rang with a clear, metallic tone",
+            "A steady ringing sound came from the mechanical bell",
+            "The distinct chime of the mechanical bell was heard",
+            "The rhythmic clanging of the mechanical bell filled the air",
+            "A sharp, repeated dinging sound indicated the mechanical bell"
+        ]
+    },
+    {
+        "event": "mechanical_fan",
+        "phrases": [
+            "a mechanical fan blowing",
+            "a mechanical fan",
+            "mechanical fan",
+            "fan",
+            "a mechanical fan runs",
+            "mechanical fan noise",
+            "fan sounds",
+            "The mechanical fan emitted a consistent whirring noise",
+            "A soft humming sound came from the rotating blades of the fan",
+            "The fan's motor produced a steady droning sound",
+            "The rhythmic whooshing of air from the fan was audible",
+            "A faint mechanical hum indicated the operation of the fan"
+        ]
+    },
+    {
+        "event": "medium_engine_(mid_frequency)",
+        "phrases": [
+            "medium engine sound",
+            "mid-frequency medium engine",
+            "noise coming from a medium engine",
+            "a medium engine runs",
+            "a medium engine revving",
+            "quick revving of a medium engine",
+            "a medium engine hums",
+            "a medium engine accelerating",
+            "a medium engine making revving sounds",
+            "a medium engine roars",
+            "a medium engine revs and squeals",
+            "a medium engine sound",
+            "a medium engine revs and accelerates",
+            "medium engine noise",
+            "a medium engine makes noise",
+            "a medium engine is making mid frequency sounds",
+            "a medium engine sounds",
+            "a medium engine revs",
+            "the sound of a medium engine",
+            "The engine emitted a steady mid-frequency hum",
+            "A moderate droning sound came from the engine at idle",
+            "The medium engine produced a deep, consistent noise",
+            "A rhythmic purring noise was heard from the engine",
+            "The engine's sound was neither too high nor too low in pitch"
+        ]
+    },
+    {
+        "event": "meow",
+        "phrases": [
+            "meow",
+            "meows",
+            "cat meow",
+            "a loud meow",
+            "meowing sound",
+            "meowing",
+            "a meow",
+            "a series of meows",
+            "a cat meows three times",
+            "a series of meowing",
+            "cat meowing 3 times",
+            "a pet cat meows two times",
+            "cat sounds",
+            "a cat meow sound",
+            "three short meows",
+            "the meow of a cat",
+            "a cat makes an anguished meow",
+            "a cat meows sharply",
+            "kitten meowing",
+            "a cat meows alternately",
+            "a cat meows loudly two times",
+            "a car meows",
+            "a cat continuously meows",
+            "cat meowing",
+            "a cat meows angrily",
+            "A cat's meow echoed softly",
+            "The meowing sound was tonal and clear",
+            "A plaintive meow was heard nearby",
+            "The cat's meow was short and distinct",
+            "A series of soft meows indicated the presence of a cat"
+        ]
+    },
+    {
+        "event": "mosquito",
+        "phrases": [
+            "mosquito",
+            "mosquito is buzzing close up",
+            "a mosquito sound created with a synth",
+            "a mosquito buzzing",
+            "an insect buzzing at a high pitch tone continuously",
+            "a mosquito buzz",
+            "a housefly buzzes by loudly nearby",
+            "a mosquito buzzes",
+            "the humming of a mosquito",
+            "random buzzing of an insect varying in loudness",
+            "buzzing mosquito flying",
+            "the sound of a mosquito",
+            "mosquitos",
+            "a mosquito",
+            "a mosquito buzzes in the background",
+            "mosquitoes buzz in the foreground",
+            "a mosquito buzzing in the background",
+            "the buzzing of a flying mosquito",
+            "a mosquito flying",
+            "a mosquito buzzes nearby",
+            "an mosquito buzzes around continuously",
+            "a mosquito chirping in the background",
+            "a synthetic sound of buzzing mosquitos or bees",
+            "a mosquito sound",
+            "The high-pitched whine of a mosquito was audible",
+            "A faint buzzing noise indicated the presence of a mosquito",
+            "The mosquito's sound was sharp and persistent",
+            "A buzzing sound hovered nearby, characteristic of a mosquito",
+            "The mosquito's whine grew louder as it approached"
+        ]
+    },
+    {
+        "event": "motorboat",
+        "phrases": [
+            "motorboat",
+            "speedboat",
+            "motorboats",
+            "motorboat motor",
+            "the sounds of a motorboat",
+            "electronic motorboat engine",
+            "motorboat engine",
+            "a motorboat engine is running loud and fast",
+            "the boat motor advances",
+            "a motorboat",
+            "a motorboat engine runs continuously",
+            "the sound of a motorboat",
+            "motorboats rev up rapidly",
+            "a motorboat engine reduces to a slower and quieter pace",
+            "a motorboat engine revving continuously",
+            "engine boat revving",
+            "a motorboat's sounds",
+            "a speedboat engine running on and off",
+            "boat motor",
+            "speedboat engine run loudly",
+            "a boat motor is running with increasing frequency",
+            "motorboat engine acceleration",
+            "a motor revs loudly and then decreases",
+            "a motorboat speeds up even more",
+            "a motorboat engine vibrates loudly nearby",
+            "The motorboat's engine produced a steady droning noise",
+            "A rhythmic chugging sound came from the motorboat",
+            "The motorboat's engine roared as it accelerated",
+            "A deep rumbling noise was heard from the motorboat",
+            "The motorboat's sound was distinct and mechanical"
+        ]
+    },
+    {
+        "event": "motorcycle",
+        "phrases": [
+            "motorcycle",
+            "motorcycle engine",
+            "motorcycles rev",
+            "motorcycle revving",
+            "motorcycle vehicle revving down",
+            "motorcycle engine revving up",
+            "motorcycle reving loudly",
+            "motorcycle revving engine",
+            "motorcycle engine revving",
+            "a motorcycle revs up",
+            "motorcycle pass",
+            "motorcycle revs quickly",
+            "a motorcycle engine accelerates quickly",
+            "motorcycle engines accelerate",
+            "motorcycles acceleration",
+            "a motorcycle engine revs up",
+            "motorcycles accelerate",
+            "revving sounds of a motorcycle",
+            "a motorcycle quickly accelerates",
+            "motorcycle engine accelerates",
+            "a motorcycle engine revving sharply",
+            "motorcycles",
+            "a motorcycle revs up loudly",
+            "motorcycle engines rev",
+            "revving motorcycles",
+            "loud accelerating motorcycle",
+            "motor bike engine revving",
+            "motor revving",
+            "a motorcycle engine decelerates",
+            "a motorcycle engine roars",
+            "The motorcycle's engine emitted a sharp roaring noise",
+            "A steady revving sound came from the motorcycle",
+            "The motorcycle's engine produced a mid-frequency drone",
+            "A rhythmic rumble indicated the presence of a motorcycle",
+            "The motorcycle's sound grew louder as it passed by"
+        ]
+    },
+    {
+        "event": "music",
+        "phrases": [
+            "music",
+            "playing music",
+            "the music",
+            "a variety of music",
+            "a mix of music",
+            "the playing of music",
+            "background of music",
+            "a music playing",
+            "a background of music",
+            "a music soundtrack",
+            "a music",
+            "a group of people listen to music",
+            "a music track plays",
+            "piece of music",
+            "a mixture of music",
+            "a piece of music",
+            "a piece of music plays",
+            "music playing",
+            "the sound of music",
+            "a music played",
+            "a musical track",
+            "music in play",
+            "music in the background",
+            "A melodic tune resonated beautifully",
+            "The rhythmic sound of music filled the air",
+            "Harmonious notes blended seamlessly in the music",
+            "The music's sound was soothing and pleasant",
+            "An instrumental melody played softly in the background"
+        ]
+    },
+    {
+        "event": "neigh",
+        "phrases": [
+            "neigh",
+            "a horse neighs",
+            "horses neigh",
+            "horse neighing",
+            "a horse neighs nearby",
+            "a horse neighs loudly",
+            "a horse neighs wildly",
+            "horses are neighing",
+            "a horse letting out a neigh",
+            "a neigh",
+            "horse neighing sound",
+            "A clear neigh echoed through the surroundings",
+            "The horse's high-pitched neigh broke the silence",
+            "A series of short neighs was heard nearby",
+            "The neighing sound carried a sense of urgency",
+            "A loud neigh resonated across the field"
+        ]
+    },
+    {
+        "event": "ocean",
+        "phrases": [
+            "ocean",
+            "surf waves",
+            "ocean waves",
+            "the ocean",
+            "seawash",
+            "surf",
+            "waves (surf)",
+            "ocean waves ebb and flow",
+            "the waves",
+            "waves",
+            "an ocean",
+            "the sounds of the ocean",
+            "the sounds of surf",
+            "ocean currents",
+            "ocean waves repeatedly crash",
+            "ocean waves are moving at a moderate pace",
+            "ocean waves break",
+            "waves break",
+            "ocean waves are breaking and crashing onto shore",
+            "the sounds of waves",
+            "ocean waves are repeatedly splashing on shore",
+            "the ocean waves",
+            "waves crashing onto shore continuously",
+            "the ocean waves are hitting the shore at a moderate pace",
+            "waves are continuously washing onto shore",
+            "waves crashing continuously onto shore",
+            "the sound of the ocean waves",
+            "close ocean waves",
+            "ocean ambience",
+            "The soothing sound of ocean waves was constant",
+            "A rhythmic crashing of waves echoed along the shore",
+            "The gentle lapping of water indicated a calm ocean",
+            "A deep, rolling wave sound was heard from the ocean",
+            "The ocean's sound was vast and ever-present"
+        ]
+    },
+    {
+        "event": "oink",
+        "phrases": [
+            "oink",
+            "oinking",
+            "an oink",
+            "oinks",
+            "animal sounds such as oink, oink",
+            "brief oinking",
+            "animal oinking",
+            "a pig oinks",
+            "pig oinking",
+            "a pig making oink",
+            "animal oink sounds",
+            "rapid oinking",
+            "oinking pig",
+            "rhythmic oinking",
+            "a pig oinks",
+            "small oinks from a pig",
+            "a pig oinking",
+            "a pig oink sound",
+            "a pig oink",
+            "an animal oinks",
+            "a pig makes oink sounds",
+            "a pig is making an oinking sound",
+            "A loud oink broke the silence",
+            "The characteristic oinking of a pig was heard",
+            "A rhythmic series of oinks came from nearby",
+            "The deep oink carried a sense of contentment",
+            "A soft, short oink was heard intermittently"
+        ]
+    },
+    {
+        "event": "owl",
+        "phrases": [
+            "owl hooting",
+            "owl sound",
+            "an owl hooting",
+            "an owl sound effect",
+            "an owl is being recorded",
+            "a hoot owl making a sound",
+            "an owl hoots",
+            "an owl vocalizes",
+            "an owl whistles",
+            "a whistling owl calls out",
+            "an owl is making hooting sounds",
+            "an owl sound effect",
+            "A soft hooting sound came from an owl",
+            "The owl's call echoed eerily in the night",
+            "A rhythmic hoot was heard repeatedly",
+            "A low, resonant hoot indicated an owl nearby",
+            "The owl's sound was distinct against the silence"
+        ]
+    },
+    {
+        "event": "paper_rustling",
+        "phrases": [
+            "papers rustling",
+            "paper rustling sounds",
+            "sounds of paper rustling",
+            "rustling sound",
+            "rustle paper",
+            "paper sounds",
+            "paper rustling and crumpling",
+            "paper rustle",
+            "The soft rustling of paper was audible",
+            "A faint crinkling noise came from handling the paper",
+            "The sound of paper rustling was brief but clear",
+            "A gentle rustle indicated someone turning pages",
+            "The crisp sound of paper being folded could be heard"
+        ]
+    },
+    {
+        "event": "pig",
+        "phrases": [
+            "pig oinks",
+            "pigs oink",
+            "pigs oinks",
+            "pig oinking",
+            "pigs",
+            "pig grunts",
+            "pigs grunt",
+            "pigs oinking",
+            "pigs crow",
+            "a pig sound",
+            "pigs intermittently oinking",
+            "pigs squeal",
+            "A loud grunt came from the pig",
+            "The pig's snorting was steady and rhythmic",
+            "A series of low grunting noises indicated the pig's presence",
+            "The pig's vocalizations were deep and guttural",
+            "A soft, contented snuffle was heard from the pig"
+        ]
+    },
+    {
+        "event": "plop",
+        "phrases": [
+            "plop",
+            "plop plop sound",
+            "plops",
+            "plopping sounds",
+            "a plop noise",
+            "a plopping sound effect",
+            "a single plop",
+            "A small plop was heard as something dropped into water",
+            "The plopping sound was soft and quick",
+            "A faint plop echoed briefly in the stillness",
+            "The sound of a plop indicated a small object entering liquid",
+            "A single plop broke the silence momentarily"
+        ]
+    },
+    {
+        "event": "police_car_(siren)",
+        "phrases": [
+            "a police car siren blares, then stops, and afterward blares again",
+            "a police siren wails once before fading in the distance",
+            "a police car siren",
+            "police car sirens ring in rapid succession",
+            "a police car blares its siren",
+            "a police car siren goes off and continues awhile",
+            "a police car siren sounds and then stops and restarts again",
+            "police car sirens blaring in succession",
+            "police car sirens blare in a series",
+            "a couple of police car sirens blaring one after the other",
+            "a police car siren wails in different patterns",
+            "police car sirens ring as they approach",
+            "a police siren rings in different patterns",
+            "continuous police car siren becoming rapid",
+            "a police car siren sounds",
+            "a police car siren sounding off continuously",
+            "a police vehicle siren wails and echoes",
+            "a police car siren sounds in different patterns continuously",
+            "The wailing siren of a police car grew louder",
+            "A sharp, oscillating siren sound filled the air",
+            "The police car's siren echoed through the streets",
+            "A high-pitched siren noise was unmistakable",
+            "The sound of the police siren faded into the distance"
+        ]
+    },
+    {
+        "event": "power_saw",
+        "phrases": [
+            "circular saw",
+            "power saw",
+            "power saws",
+            "table saw",
+            "circular saw is being recorded",
+            "a circular saw in operation",
+            "a circular saw is being used",
+            "a circular saw runs",
+            "electric saw cutting",
+            "a power saw running",
+            "a power saw cutting some objects",
+            "a power saw makes a cutting sound",
+            "sound of table saw",
+            "a power saw cuts an object",
+            "a loud power sawing",
+            "a power saw turns on and runs",
+            "a power saw runs",
+            "power saw sounds",
+            "mechanical saw sawing",
+            "sound of a power saw",
+            "a power saw cutting",
+            "a man uses a power saw",
+            "large power sawing",
+            "power tools saw",
+            "a power saw cuts",
+            "The sharp buzzing of a power saw was heard cutting through material",
+            "A high-pitched whirring noise came from the power saw",
+            "The power saw's motor emitted a steady droning sound",
+            "The power saw produced a harsh grinding noise as it worked",
+            "A rhythmic sawing sound indicated continuous operation of the power saw"
+        ]
+    },
+    {
+        "event": "power_tool",
+        "phrases": [
+            "power tool sound",
+            "a soft power tool drilling",
+            "power tools run",
+            "a man is using a power tool",
+            "a power tool making drilling noises",
+            "vibrations from a power tool",
+            "a power tool running",
+            "ringing of a power tool",
+            "the sound of a power tool spinning",
+            "a power tool runs continuously",
+            "power tools drill through materials",
+            "power tools make noise",
+            "power tools buzz",
+            "power tool sounds",
+            "power tools are at work",
+            "power tools are being used",
+            "a power tool buzzes",
+            "power tools cut",
+            "The power tool emitted a loud, mechanical buzzing sound",
+            "A steady whir came from the power tool as it operated",
+            "The motorized tool produced a rhythmic grinding noise",
+            "A high-pitched hum indicated the use of the power tool",
+            "The sound of the power tool was sharp and mechanical"
+        ]
+    },
+    {
+        "event": "printer",
+        "phrases": [
+            "printer",
+            "printer is printing a document",
+            "printer is printing out a receipt",
+            "printers",
+            "printer is printing a piece of paper",
+            "printers print continuously",
+            "a printer prints",
+            "printer turning on",
+            "a printer is printing a receipt",
+            "an operating printer",
+            "a home printer",
+            "a printer printing",
+            "a desktop printer operating",
+            "printer hum",
+            "a printer operates",
+            "printers are printing with mechanisms sounds",
+            "a printer is turning on",
+            "a printer",
+            "a printer is printing out a receipt",
+            "a 3d printer is printing",
+            "a printer runs",
+            "a printer works",
+            "a printer mechanism",
+            "printer noise",
+            "gears operating on a printer",
+            "a 3d printer is in action",
+            "printer mechanisms",
+            "the sounds of a printer",
+            "a printer is scanning a book",
+            "a printer being turned on",
+            "The printer emitted a rhythmic whirring and clicking sound",
+            "A steady hum accompanied the operation of the printer",
+            "The printer's motor produced a faint, mechanical droning noise",
+            "A sequence of beeps and printing noises indicated activity",
+            "The sound of paper feeding added to the printer's operation noise"
+        ]
+    },
+    {
+        "event": "propeller",
+        "phrases": [
+            "propeller",
+            "propellers",
+            "propeller noise",
+            "propeller sounds",
+            "propellers spin",
+            "prop engine sound",
+            "a propeller sound",
+            "propellers twirling",
+            "air propellers",
+            "propeller create wind sounds",
+            "propellers rotating",
+            "propeller make sounds",
+            "propeller sounds get loud as it comes close",
+            "propellers air and buzz",
+            "drone propellers",
+            "propeller blades",
+            "propellers are spinning",
+            "propeller airscrew sounds",
+            "a propeller is running",
+            "The propeller's rhythmic chopping sound filled the air",
+            "A steady hum came from the spinning propeller",
+            "The propeller noise grew louder as the blades spun faster",
+            "The sound of the propeller was deep and mechanical",
+            "A distinct whirring noise indicated the operation of the propeller"
+        ]
+    },
+    {
+        "event": "quack",
+        "phrases": [
+            "quack",
+            "a duck quacks",
+            "artificial duck quacks",
+            "the sound of a duck quacking",
+            "the sound of a quacking duck",
+            "a duck quacks rhythmically",
+            "quacking",
+            "a duck quacks in rapid succession",
+            "a duck quacks loudly nearby multiple times",
+            "duck quacks",
+            "duck quaking loudly",
+            "quacks",
+            "a duck quacks loudly and continuously",
+            "duck quack",
+            "a duck quacking repeatedly without breaks",
+            "a single quack",
+            "a duck quacks first moderately and then vigorously",
+            "duck quaking",
+            "quacking duck",
+            "a duck repeatedly quacks loudly",
+            "a duck quacks continuously",
+            "duck quacking loudly",
+            "a duck quacking several times",
+            "duck quacking",
+            "a mother duck quacks",
+            "a duck quacks rapidly",
+            "a duck quacks many times",
+            "a duck quacks multiple times",
+            "loud and rapid quacking",
+            "a duck quacking continuously at consistent intervals",
+            "A loud quack broke the silence",
+            "The characteristic quacking noise of a duck was heard nearby",
+            "A rhythmic series of quacks echoed across the water",
+            "The duck's quack sounded sharp and distinct",
+            "A soft quack was heard intermittently"
+        ]
+    },
+    {
+        "event": "race_car",
+        "phrases": [
+            "race car engines",
+            "race car running",
+            "race car noise",
+            "race car",
+            "race car speeding off",
+            "race car sounds",
+            "car engine sound in car race",
+            "a race car runs",
+            "sounds of a car race",
+            "race car engine revs",
+            "auto racing",
+            "auto racing sound",
+            "a race car loudly accelerate outside",
+            "auto racing running",
+            "a race car accelerates loudly",
+            "an auto racing passing by",
+            "The roar of a race car engine echoed loudly",
+            "A sharp revving sound came from the race car",
+            "The race car's engine produced a deep, powerful noise",
+            "A high-pitched whine accompanied the acceleration of the race car",
+            "The sound of the race car was intense and relentless"
+        ]
+    },
+    {
+        "event": "rain",
+        "phrases": [
+            "rain",
+            "raining",
+            "rainfall",
+            "rain falling",
+            "rain fall",
+            "rain falls",
+            "rain on surface",
+            "rain falls steadily",
+            "rain falling onto a hard surface",
+            "raining hard",
+            "rain falls on surface",
+            "rain steadily falls",
+            "rain falling heavily",
+            "rain is falling and spattering on a surface",
+            "rain falling heavily on a surface",
+            "rain is falling hard on a tile floor",
+            "rain falls onto a surface",
+            "rain falls onto a street",
+            "rain falls on a surface very thickly nearby",
+            "rain falling on the surface",
+            "rain falls onto a hard surface heavily",
+            "rain falling and dropping on a surface",
+            "rain fall heavily",
+            "rain on surfaces",
+            "rain falls steadily onto a hard surface",
+            "rain falls loudly and rapidly on a surface",
+            "rain falling hard",
+            "rain falling on a surface",
+            "rain fall onto a hard surface",
+            "rain is falling and pattering on a hard surface",
+            "The gentle patter of rain was soothing to hear",
+            "A steady rain sound filled the environment",
+            "The rhythmic dripping of rain was audible",
+            "A soft rainfall sound created a calming atmosphere",
+            "The rain's sound was consistent and natural"
+        ]
+    },
+    {
+        "event": "rain_on_surface",
+        "phrases": [
+            "pitter-patter of rain",
+            "rain on surface",
+            "rain",
+            "rain is falling hard on a tile floor",
+            "rain falls heavily onto a hard surface",
+            "rain is falling and spattering on a surface",
+            "rain falls onto a hard surface heavily",
+            "rain falling heavily on a surface",
+            "rain falling onto a hard surface",
+            "rain is falling and pattering on a hard surface",
+            "rain falls heavily on a surface",
+            "rain fall onto a hard surface",
+            "rain falls down loudly on a surface",
+            "rain falling on a hard surface",
+            "a strong rainfall on a hard surface",
+            "rain pours heavily on a surface",
+            "rain falls rapidly on a surface",
+            "rain falls down rapidly",
+            "rain fall",
+            "rain falls on a surface rapidly nearby",
+            "rain taps",
+            "rain is falling on a surface hard",
+            "rain is falling and hitting surfaces",
+            "raindrops pitter-patter",
+            "rain falls on a surface very thickly nearby",
+            "rain is falling hard on a surface",
+            "rain is falling very very hard onto a surface",
+            "rain falls very loudly on a surface",
+            "rain on surfaces",
+            "raining hard",
+            "The steady sound of rain striking a roof was constant",
+            "A rhythmic tapping noise came from rain hitting a window",
+            "The rain on the surface created a soft splattering sound",
+            "A persistent pattering of rain was heard on the ground",
+            "The sound of rain on the surface was soothing and steady"
+        ]
+    },
+    {
+        "event": "rattle",
+        "phrases": [
+            "rattle",
+            "rattle sounds",
+            "rattles shake",
+            "a shake",
+            "a fast rattle",
+            "a rattle sounds",
+            "vibrations rattle",
+            "a rattle shakes",
+            "rattle sounds are heard intermittently",
+            "a rattle noise",
+            "A rapid rattling noise came from loose objects",
+            "The sound of small items rattling was sharp and consistent",
+            "A faint rattle echoed from within a container",
+            "The rattling noise was abrupt and repetitive",
+            "A series of clattering sounds indicated movement"
+        ]
+    },
+    {
+        "event": "reversing_beeps",
+        "phrases": [
+            "reversing beeps are heard",
+            "reversing beeps sound",
+            "reversing beeps are processed",
+            "reversing beeps",
+            "reversing beeps occur in a short series",
+            "reversing beep sounds",
+            "sound of reversing beeps",
+            "reversing beeps in a mechanical setting",
+            "beeping sounds repeat",
+            "a reversing beeps loudly nearby several times",
+            "beeping inside a room",
+            "The reversing beep of a vehicle was steady and rhythmic",
+            "A sharp beeping sound indicated a vehicle in reverse",
+            "The warning beeps grew louder as the vehicle reversed",
+            "A repetitive beep alerted pedestrians to a reversing vehicle",
+            "The sound of reversing beeps was mechanical and consistent"
+        ]
+    },
+    {
+        "event": "ringing_tone",
+        "phrases": [
+            "ringing sound",
+            "ringing tones",
+            "bell tone",
+            "ringtone sound",
+            "ringing sound",
+            "ringback tone",
+            "ringing reverb",
+            "bell sound",
+            "boing sound",
+            "A clear ringing tone sounded from a nearby phone",
+            "The phone's ringing tone was sharp and attention-grabbing",
+            "A steady beeping noise indicated an incoming call",
+            "The sound of a ringing tone echoed in the room",
+            "The synthesized ringing sound was electronic and distinct"
+        ]
+    },
+    {
+        "event": "sanding",
+        "phrases": [
+            "sanding",
+            "sanding a wooden surface",
+            "sanding wood",
+            "sanding a solid object",
+            "sanding on wood",
+            "sanding and rubbing",
+            "brushing",
+            "dry sanding",
+            "wood sanding",
+            "sanding and filing",
+            "someone is sanding",
+            "rub sanding",
+            "someone is sanding a piece of wood",
+            "a series of sanding",
+            "a person sands an object",
+            "wood brushing",
+            "the sounds of sanding",
+            "sanding of wood",
+            "wood being sanded",
+            "some sanding",
+            "The sound of sanding was rough and consistent",
+            "A rhythmic scraping noise came from the sanding process",
+            "The sanding sound grew softer as the surface smoothed out",
+            "The abrasive sound of sanding was sharp and repetitive",
+            "A faint scratching noise was heard during sanding"
+        ]
+    },
+    {
+        "event": "sawing",
+        "phrases": [
+            "sawing",
+            "sawing repeats",
+            "sawing wood",
+            "sawing a plastic surface",
+            "a saw sawing",
+            "sawing noises",
+            "sawing a solid object",
+            "sawing of wood",
+            "sawing occurs",
+            "a person is sawing a solid object",
+            "a saw sawing wood",
+            "sawing and wood sounds",
+            "industrial saw sawing wood",
+            "wooden sawing",
+            "a saw is used on wood",
+            "sawing something",
+            "a solid object is sawed",
+            "metal sawing wood",
+            "a person is sawing an object",
+            "sawing of wood products",
+            "sawing of wood with a hand saw",
+            "a man uses a saw to cut a solid object",
+            "sawing of wood is occurring",
+            "rhythmic metal sawing",
+            "a tool sawing wood",
+            "wood sawing",
+            "a saw cutting a solid object",
+            "a saw cutting an object",
+            "sawing a bamboo stick",
+            "a person saws an object",
+            "The sawing sound was rhythmic and sharp",
+            "A steady rasping noise came from the saw cutting through material",
+            "The sound of sawing grew louder as the blade moved faster",
+            "A high-pitched noise indicated a motorized saw in use",
+            "The sawing sound was rough and mechanical"
+        ]
+    },
+    {
+        "event": "scrape",
+        "phrases": [
+            "scrape",
+            "scratch",
+            "a scraping",
+            "the sounds of scraping",
+            "scraping with surface contact",
+            "a scrape sound",
+            "a rub",
+            "rough scraping",
+            "a scrape",
+            "A sharp scraping sound was heard as the surface was scratched",
+            "The sound of scraping was harsh and repetitive",
+            "A faint scraping noise came from a distant source",
+            "The scraping sound grew louder as the edge moved across the surface",
+            "The noise of scraping was abrasive and mechanical"
+        ]
+    },
+    {
+        "event": "screaming",
+        "phrases": [
+            "screaming",
+            "screams",
+            "wild screaming",
+            "painful screams",
+            "screams loudly",
+            "a person screaming in terror",
+            "a constant screaming",
+            "a kid screaming",
+            "people are screaming",
+            "a group of people are screaming wildly",
+            "people scream in fear",
+            "she screams",
+            "someone is desperately screaming",
+            "a loud screaming",
+            "human screaming",
+            "a woman is screaming in terror",
+            "woman screaming",
+            "the adult female screams",
+            "males scream",
+            "people screaming",
+            "young child is sustained screaming",
+            "women screaming",
+            "human screams",
+            "a girl screaming in a soundstage",
+            "a baby screaming in the foreground",
+            "an adult male screams",
+            "screaming babies",
+            "females scream",
+            "male screaming",
+            "A loud scream pierced the air",
+            "The sound of screaming was sharp and high-pitched",
+            "A series of screams echoed in the distance",
+            "The scream was sudden and startling",
+            "A prolonged scream carried a sense of urgency"
+        ]
+    },
+    {
+        "event": "sewing_machine",
+        "phrases": [
+            "sewing machines",
+            "sewing machine",
+            "sewing machine stitches",
+            "sewing machine stitching",
+            "sewing machine operating",
+            "sewing machine running",
+            "a sewing machine returns to rapid sewing",
+            "a sewing machine works at regular intervals",
+            "a woman taps and clicks with a sewing machine",
+            "sewing machine being used",
+            "sewing machine working",
+            "a sewing machine clinks repetitively before stopping",
+            "a sewing machine being used",
+            "sewing machine running",
+            "sewing machine mechanisms",
+            "a sewing machine operates several times",
+            "sewing machine going",
+            "sewing machines operate intermittently",
+            "sewing machine runs",
+            "a sewing machine is used with sewing sounds",
+            "the sounds of a sewing machine",
+            "a sewing machine is being used",
+            "sewing machine clacking",
+            "a sewing machine operates",
+            "a sewing machine is making sound",
+            "short bursts of sewing",
+            "The sewing machine emitted a steady whirring noise",
+            "A rhythmic clicking sound came from the sewing machine",
+            "The motor of the sewing machine produced a faint hum",
+            "The sound of the sewing machine was quick and mechanical",
+            "A soft clattering noise indicated the machine was in use"
+        ]
+    },
+    {
+        "event": "sheep",
+        "phrases": [
+            "sheep",
+            "sheep bleat",
+            "a young sheep bleats",
+            "sheep baah",
+            "sheep baa",
+            "a sheep bleats multiple times",
+            "a number of sheep bleating continuously",
+            "sheep baaing",
+            "several sheep bleating unceasingly",
+            "older sheep bleating",
+            "a sheep bleats nearby multiple times",
+            "young sheep baa",
+            "the sounds of sheep",
+            "a sheep bleets",
+            "sheeps bleat",
+            "sheep bleats",
+            "a sheep bleats",
+            "a sheep goes baa",
+            "sheep grunt",
+            "the sheep bleat",
+            "a sheep beys",
+            "sheep scream",
+            "a sheep bleats a couple of times",
+            "a sheep sporadically bleating",
+            "sheep bleating sporadically",
+            "several sheep bah",
+            "a sheep baas",
+            "A loud bleat came from a sheep",
+            "The sheep's bleating was rhythmic and persistent",
+            "A series of low bleating noises indicated a flock nearby",
+            "The sound of the sheep was soft and distinct",
+            "A faint bleat echoed in the distance"
+        ]
+    },
+    {
+        "event": "shout",
+        "phrases": [
+            "shout",
+            "people shout",
+            "shouts",
+            "crowd shouting",
+            "a crowd of people shouting",
+            "several people shout",
+            "yelling",
+            "shouting",
+            "a man briefly shouting",
+            "crowd yelling",
+            "yells",
+            "shout sounds",
+            "a guy shouts",
+            "a man yelling",
+            "children are yelling in unison",
+            "people are shouting",
+            "the sounds of shouting",
+            "a man yelling",
+            "someone shouting out",
+            "a group of people erupt with shouts",
+            "someone is yelling",
+            "a loud scream",
+            "the male screams",
+            "A loud shout carried over the noise",
+            "The sound of shouting was clear and deliberate",
+            "A sharp shout broke the silence",
+            "The shouting noise was steady and commanding",
+            "A distant shout could be faintly heard"
+        ]
+    },
+    {
+        "event": "shower",
+        "phrases": [
+            "shower",
+            "shower water",
+            "showering",
+            "bathtub sounds",
+            "shower sounds",
+            "water tap sound",
+            "shower running",
+            "showers",
+            "water runs into a shower",
+            "a shower",
+            "face washing",
+            "a shower is running loudly",
+            "shower is running inside a bathroom",
+            "someone is turning a shower on",
+            "bathroom shower is spraying water",
+            "someone is using a shower",
+            "The steady sound of water spraying from the shower was soothing",
+            "A rhythmic splashing noise came from the running shower",
+            "The sound of water hitting the shower floor was distinct",
+            "A soft hissing noise indicated the spray of the shower",
+            "The shower's noise was consistent and calming"
+        ]
+    },
+    {
+        "event": "shuffling_cards",
+        "phrases": [
+            "playing cards are being riffled and modified",
+            "people are shuffling cards inside a small room",
+            "shuffling cards sounds",
+            "playing cards are being ruffled through",
+            "cards shuffling",
+            "shuffling cards",
+            "cards are being shuffled",
+            "someone shuffles a deck of cards",
+            "cards shuffling on a surface",
+            "cards are shuffling on a hard surface",
+            "card shuffling sounds",
+            "someone is shuffling a deck of cards",
+            "someone is shuffling playing cards",
+            "shuffling card sounds",
+            "cards are being riffle shuffled",
+            "paper shuffles",
+            "people shuffle cards in a small room",
+            "cards are shuffled repeatedly",
+            "cards are being shuffled with the riffle shuffle method",
+            "people shuffle cards",
+            "a deck of playing cards is being shuffled",
+            "The sound of shuffling cards was quick and rhythmic",
+            "A soft rustling noise came from the deck being shuffled",
+            "The cards made a faint clicking sound as they were shuffled",
+            "A rhythmic series of card noises indicated shuffling",
+            "The shuffling sound was soft but distinct"
+        ]
+    },
+    {
+        "event": "sigh",
+        "phrases": [
+            "a heavy sigh",
+            "an emotional sigh",
+            "sighing",
+            "a long sigh",
+            "a human sigh sound",
+            "sighing sounds",
+            "a sigh in the background",
+            "a sigh",
+            "a young male sighs",
+            "sighs",
+            "a female sigh",
+            "a person sighing",
+            "A soft sigh was audible, indicating relief or exhaustion",
+            "The sound of a sigh broke the silence momentarily",
+            "A gentle exhalation was heard, resembling a sigh",
+            "The sigh was deep and carried a sense of weariness",
+            "A faint, audible sigh signaled contemplation or relief"
+        ]
+    },
+    {
+        "event": "sink_(filling_or_washing)",
+        "phrases": [
+            "running tap in wash basin",
+            "water pouring",
+            "filling a sink",
+            "a faucet pours water",
+            "water flows hard from a faucet into a tub",
+            "tap water",
+            "water faucet running water",
+            "water from the sink and faucet",
+            "sink filling",
+            "a water tap runs into a hollow surface",
+            "kitchen water faucet starting and stopping",
+            "water flows from a sink and faucet",
+            "water gushes and fills a sink",
+            "a water tap is turned on and fills a sink",
+            "water flowing hard from a faucet in short bursts",
+            "washing",
+            "water fills and runs in a sink",
+            "water is running from a faucet into a sink",
+            "running water in sink",
+            "water flowing from a faucet at different intervals",
+            "water flows from a tap into a bathtub",
+            "The sound of water splashing into the sink was steady",
+            "A rhythmic dripping noise came from the filling sink",
+            "The sound of dishes being washed in the sink was distinct",
+            "A soft gurgling noise indicated the sink draining water",
+            "The continuous sound of running water filled the sink"
+        ]
+    },
+    {
+        "event": "siren",
+        "phrases": [
+            "siren",
+            "emergency siren",
+            "emergency vehicle siren",
+            "siren sound",
+            "a siren sounds on an emergency vehicle",
+            "fire emergency vehicle siren",
+            "a remix of a siren is playing and looping",
+            "its siren",
+            "an emergency siren wails loudly",
+            "sirens wail in quick succession",
+            "a siren wails continuously",
+            "siren from emergency vehicle",
+            "a siren blaring continuously",
+            "ambulance siren wail",
+            "a siren wails loudly continuously",
+            "emergency vehicle sirens blare",
+            "emergency vehicle siren blaring",
+            "an emergency vehicle siren wails continuously",
+            "a fire engine sounds its siren",
+            "an emergency siren goes off loudly",
+            "emergency vehicle siren",
+            "fast siren",
+            "emergency vehicle siren blasts",
+            "police emergency vehicle siren",
+            "sirens",
+            "old crank emergency siren",
+            "siren wails",
+            "a siren ringing of a passing emergency vehicle",
+            "the siren of an emergency vehicle sounds and fades away",
+            "A loud siren wailed in the distance",
+            "The siren's pitch rose and fell rhythmically",
+            "A sharp, piercing siren sound filled the air",
+            "The sound of the siren was unmistakable and urgent",
+            "A continuous wailing noise indicated an emergency"
+        ]
+    },
+    {
+        "event": "sizzle",
+        "phrases": [
+            "sizzle",
+            "a quick sizzle",
+            "food sizzle",
+            "sizzling",
+            "sizzles",
+            "liquids sizzle",
+            "sizzle of food",
+            "grease sizzles",
+            "oil sizzle",
+            "fizzing",
+            "sizzle of frying food",
+            "more sizzle",
+            "objects sizzle",
+            "sizzling pan",
+            "soft sizzle",
+            "a continuous sizzle",
+            "a sizzle overhead",
+            "food sizzle while frying",
+            "a sizzle of frying food at the end",
+            "good sizzles",
+            "food sizzles in cookware",
+            "food sizzles in a pan",
+            "a frying pan sizzles",
+            "a sizzling sound with multiple layers",
+            "a sizzle sound",
+            "carbonated fizz",
+            "chicken is frying in a pan with a sizzle sound",
+            "food sizzles on a grill",
+            "a pan sizzles",
+            "a continuous sizzle of frying",
+            "The sizzle of fat cooking was sharp and constant",
+            "A rhythmic sizzling noise came from the heated pan",
+            "The sound of sizzling bubbles was audible and distinct",
+            "A soft sizzling sound indicated food being fried",
+            "The sizzle grew louder as the oil heated up"
+        ]
+    },
+    {
+        "event": "slam",
+        "phrases": [
+            "slam",
+            "slamming",
+            "a door slam sound effect",
+            "a slam",
+            "a loud smash",
+            "a loud slam",
+            "a bang",
+            "a glass and steel door slams",
+            "the sounds of slamming",
+            "a slamming thud",
+            "slamming a door",
+            "a simulation of a hit",
+            "a cupboard slams",
+            "a bang of a door closing",
+            "a smash",
+            "a metallic slam",
+            "slams",
+            "a slamming and sound effect noise",
+            "a loud metal object slamming shut",
+            "thud",
+            "a deep thud",
+            "a big slam",
+            "a large bang",
+            "door bang",
+            "door slamming",
+            "heavy impact",
+            "heavy metal door closing",
+            "A loud slam echoed through the room as the door shut",
+            "The sharp sound of a slammed door broke the silence",
+            "The slam was sudden and forceful, resonating loudly",
+            "A heavy slam indicated something closed violently",
+            "The sound of the slam was abrupt and startling"
+        ]
+    },
+    {
+        "event": "slap",
+        "phrases": [
+            "slap",
+            "slapping",
+            "slaps",
+            "smack",
+            "a slap sound",
+            "a wet slap",
+            "smack sound",
+            "hands slapping",
+            "slap and smack sounds",
+            "a slap",
+            "A sharp slapping sound was heard clearly",
+            "The slap was loud and sudden, breaking the quiet",
+            "A quick slapping noise indicated contact between two surfaces",
+            "The slap sound resonated briefly before fading away",
+            "A distinct slap noise was audible from nearby"
+        ]
+    },
+    {
+        "event": "smoke_detector",
+        "phrases": [
+            "a sharp smoke detector beep sounds continuously",
+            "smoke detector beep sound",
+            "smoke detector sound",
+            "a smoke detector is beeping",
+            "a smoke detector alarm",
+            "a smoke detector is making beeping noises",
+            "a smoke detector is ringing",
+            "a smoke detector is ticking",
+            "smoke detectors beep",
+            "a smoke detector beeps",
+            "a smoke detector goes off",
+            "The sharp beeping of a smoke detector was repetitive",
+            "A loud alarm sounded from the smoke detector",
+            "The smoke detector emitted a high-pitched, urgent beep",
+            "A consistent beeping noise indicated a smoke warning",
+            "The sound of the smoke detector was unmistakable and alarming"
+        ]
+    },
+    {
+        "event": "sneeze",
+        "phrases": [
+            "sneeze",
+            "a loud sneeze",
+            "a young person sneezes",
+            "loud male sneeze",
+            "sneezes",
+            "sneeze sound",
+            "a short sneeze",
+            "men sneeze",
+            "a woman sneezes a deep sneeze",
+            "male sneezes",
+            "a person sneezes",
+            "a large sneeze",
+            "an adult female sneezes",
+            "a series of sneezes",
+            "multiple sneezes",
+            "an adult female sneezes once",
+            "man sneezes",
+            "a single loud sneeze",
+            "a sneezing",
+            "sneezing",
+            "a high pitched sneeze",
+            "A loud sneeze broke the silence",
+            "The sound of a sneeze was sudden and forceful",
+            "A quick, sharp sneeze was heard nearby",
+            "The sneeze was abrupt and followed by a soft exhalation",
+            "A muffled sneeze indicated an attempt to suppress it"
+        ]
+    },
+    {
+        "event": "snoring",
+        "phrases": [
+            "snoring",
+            "continuous, light snoring",
+            "snoring man sleeping",
+            "a series of snoring",
+            "snoring over and over",
+            "deep coarse snoring",
+            "snoring continuously",
+            "snoring sound",
+            "snoring from a person",
+            "deep, rough continuous snoring",
+            "low pitched snoring",
+            "low snoring",
+            "snoring intermittently",
+            "deep, loud snoring",
+            "a series of snoring sounds",
+            "a series of snores",
+            "continuous, loud snoring",
+            "continuous repetitive snoring",
+            "deep and low snoring",
+            "snoring sounds one after another",
+            "soft, rapid snoring",
+            "repetitive, loud snoring",
+            "snoring repeating several times",
+            "a person continuously snores in and out",
+            "a sleeping person is snoring rhythmically",
+            "snoring occurs in a rhythmic pattern",
+            "low, slow, soft snoring",
+            "loud, consistent snoring",
+            "rhythmical snoring nearby",
+            "a sleeping person emits a gravelly snore",
+            "The soft rumble of snoring was consistent and rhythmic",
+            "A loud, intermittent snore echoed in the room",
+            "The sound of snoring was deep and guttural",
+            "A faint snore was heard in the background",
+            "The rhythmic snoring noises grew louder as the person slept deeply"
+        ]
+    },
+    {
+        "event": "speech",
+        "phrases": [
+            "speech",
+            "person making a speech",
+            "person giving a talk",
+            "male giving speech",
+            "make speech",
+            "a person is giving a speech",
+            "an speech",
+            "a person making a speech",
+            "a person's speech",
+            "a speaker is giving a speech",
+            "people give a speech",
+            "a human speech",
+            "a young man gives a speech",
+            "a person having a speech",
+            "a speech is delivered",
+            "a speech",
+            "a person giving a speech",
+            "a young man giving a speech",
+            "young man delivering a speech",
+            "a person gives a speech",
+            "human speeches",
+            "human speech",
+            "a man gives a public speech",
+            "a man makes a speech",
+            "a man giving a public speech",
+            "speech of monologue",
+            "a person delivering a speech",
+            "a person speaking a language",
+            "speeches",
+            "A steady flow of speech was heard in a conversational tone",
+            "The speech was clear and articulate",
+            "A rhythmic cadence in the speech made it engaging",
+            "The sound of speech was lively and animated",
+            "A faint murmur of speech could be heard from afar"
+        ]
+    },
+    {
+        "event": "spray",
+        "phrases": [
+            "spray",
+            "sprays",
+            "spraying",
+            "spraying",
+            "short spray",
+            "spray sound",
+            "spray painting",
+            "spray intermittently",
+            "liquid is sprayed",
+            "spraying paint",
+            "a small spray",
+            "someone is spraying",
+            "liquid sprays",
+            "spraying liquid",
+            "compressed liquid spraying",
+            "a sprayer sprays liquid",
+            "a brief spray",
+            "a spray",
+            "a single spray",
+            "sprays burst",
+            "a sprayer sprays",
+            "The sound of liquid spraying was sharp and continuous",
+            "A rhythmic spraying noise came from the nozzle",
+            "The spray produced a soft hissing sound",
+            "A fine mist spray created a faint, audible noise",
+            "The spraying sound was consistent and soothing"
+        ]
+    },
+    {
+        "event": "squawk",
+        "phrases": [
+            "squawks",
+            "a bird squawk",
+            "pigeons are squawking",
+            "birds squawks",
+            "an animal squawks",
+            "birds squawking",
+            "birds are squawking",
+            "a bunch of birds squawking",
+            "A loud squawk pierced the air",
+            "The bird's squawk was harsh and abrasive",
+            "A series of squawks echoed in the distance",
+            "The squawking noise was sharp and unpleasant",
+            "A single squawk sounded abruptly nearby"
+        ]
+    },
+    {
+        "event": "squeak",
+        "phrases": [
+            "squeak",
+            "squeaky",
+            "squeaking",
+            "squeaks",
+            "squeaks sound repeatedly",
+            "squeaky sounds",
+            "a squeak sounds",
+            "another squeak",
+            "squeaky sounds are being made",
+            "a squeaky sound",
+            "squeaky noises",
+            "a squeaky voice",
+            "a squeak sound",
+            "squeak sounds",
+            "a squeaky noise",
+            "a small squeaking",
+            "two squeaks",
+            "shoe squeaking",
+            "squeaking sounds in the background",
+            "several squeaks",
+            "a man is squeaking",
+            "squeaky sound plays",
+            "squeaking loud",
+            "short squeaks",
+            "a squeaking sound",
+            "a squeaking",
+            "squeaks occur",
+            "a squeaking noise",
+            "an object squeaks",
+            "squeaking noise",
+            "A faint squeak came from the floorboards",
+            "The sound of a squeak was high-pitched and brief",
+            "A rhythmic squeaking noise indicated movement",
+            "The squeak was sharp and intermittent",
+            "A soft squeak sounded faintly in the background"
+        ]
+    },
+    {
+        "event": "squeal",
+        "phrases": [
+            "squeal",
+            "a squeal",
+            "squealing",
+            "squealing consistently",
+            "the squealing",
+            "a squealing",
+            "screeching",
+            "a screeching",
+            "a quick squeal",
+            "a squealing sound",
+            "A loud squeal echoed sharply in the air",
+            "The squealing noise was high-pitched and intense",
+            "A faint squeal was heard in the distance",
+            "The sound of the squeal varied slightly in pitch",
+            "A sudden squeal was sharp and startling"
+        ]
+    },
+    {
+        "event": "static",
+        "phrases": [
+            "static",
+            "static mic",
+            "static noise",
+            "radio static sound",
+            "radio static",
+            "static occurs repeatedly",
+            "digital static",
+            "static continues",
+            "radio signal distortion",
+            "heavy static",
+            "a loud static plays continuously",
+            "static fills the microphone",
+            "static occurs continuously",
+            "a static distortion",
+            "a series of static",
+            "continuous static",
+            "a series of radio white noise",
+            "roaring static",
+            "television static",
+            "a static",
+            "microphone static",
+            "static crackles",
+            "portion of static",
+            "telephone static",
+            "The crackling of static was faint but persistent",
+            "A hissing noise of static filled the background",
+            "The sound of static was sharp and irregular",
+            "A burst of static noise was heard over the audio",
+            "The static sound was continuous and slightly crackling"
+        ]
+    },
+    {
+        "event": "steam",
+        "phrases": [
+            "steam",
+            "hissing steam",
+            "steam hisses sharply",
+            "steam hiss",
+            "an iron letting off steam",
+            "steam opening",
+            "steam from train",
+            "a long spray of steam is escaping",
+            "a steam engine lets off steam",
+            "the sounds of steam",
+            "steam releases",
+            "a steamy implement is used to help clean floors",
+            "steam puffs",
+            "steam train hiss",
+            "steam sounds",
+            "steam engine",
+            "the hiss of steam",
+            "the hissing of steam",
+            "a steam engine is hissing and chugging",
+            "steam hisses",
+            "steam hissing repetitively",
+            "its engine steam hisses",
+            "steam mix intermittently",
+            "steam loudly hisses",
+            "the steam hisses",
+            "steam is releasing from an engine",
+            "steam hisses loudly",
+            "the hiss of pressurized steam",
+            "steam is released then stops and is released again",
+            "hissing from steam",
+            "The sound of steam escaping was sharp and hissing",
+            "A faint hissing noise indicated steaming water",
+            "The steam emitted a consistent, gentle sound",
+            "A rhythmic release of steam created a soft whooshing noise",
+            "The sound of steam was soothing and constant"
+        ]
+    },
+    {
+        "event": "steam_whistle",
+        "phrases": [
+            "a steam whistle",
+            "steam whistle",
+            "a steam whistle sounds",
+            "a steam whistle toots",
+            "a steam whistle goes off",
+            "a steam whistle blows",
+            "a train steam whistle",
+            "steam whistles",
+            "steam whistle is roaring",
+            "steam whistle is sounding",
+            "a steam whistle is triggered",
+            "a loud steam whistle",
+            "a steam whistle",
+            "The steam whistle emitted a sharp, high-pitched tone",
+            "A loud whistle sound came from the steam whistle",
+            "The sound of the steam whistle was piercing and clear",
+            "A rhythmic tooting noise indicated a steam whistle in use",
+            "The whistle sound was abrupt and attention-grabbing"
+        ]
+    },
+    {
+        "event": "stream",
+        "phrases": [
+            "stream",
+            "stream water",
+            "stream water flows",
+            "streams",
+            "water streams",
+            "running water in a river",
+            "stream flowing continuously",
+            "stream of water",
+            "water running down a stream",
+            "streaming waters",
+            "river stream",
+            "river running",
+            "river running down stream",
+            "water trickles down a stream",
+            "a loopable water stream",
+            "a stream of water flowing",
+            "streaming water",
+            "water running down river",
+            "a stream of water flows and trickles",
+            "river water streaming",
+            "a stream of water trickles and flows",
+            "rippling water flows steadily",
+            "a river of water flows",
+            "river of water flowing",
+            "a river stream of water flowing",
+            "a stream of water flowing and trickling",
+            "water stream running",
+            "a small stream",
+            "water trickling down the stream",
+            "a stream of water flows slowly and splashes",
+            "The gentle sound of a stream flowing was soothing",
+            "A rhythmic gurgling noise came from the stream",
+            "The sound of water trickling was clear and persistent",
+            "A soft splashing noise indicated the presence of a stream",
+            "The stream's sound was natural and calming"
+        ]
+    },
+    {
+        "event": "tearing",
+        "phrases": [
+            "tearing",
+            "tearing paper",
+            "paper tearing",
+            "tearing tape",
+            "tearing plastic",
+            "tearing up paper",
+            "tears paper",
+            "paper ripping",
+            "paper tears",
+            "peeling paper",
+            "single paper rip",
+            "tearing of a sheet of toilet paper",
+            "someone is tearing into pieces a small sheet of paper",
+            "tearing masking tape",
+            "tearing aluminum/tin foil",
+            "a paper tear reveal",
+            "tearing noise",
+            "someone is tearing a thin piece of rough cardboard",
+            "tearing paper sound",
+            "consistent ripping and tearing",
+            "someone is tearing up a piece of paper",
+            "The sound of tearing paper was sharp and abrupt",
+            "A faint ripping noise came from tearing fabric",
+            "The tearing sound was quick and distinct",
+            "A rhythmic tearing noise indicated repeated action",
+            "The sound of tearing was rough and jarring"
+        ]
+    },
+    {
+        "event": "telephone",
+        "phrases": [
+            "telephone",
+            "telephones",
+            "phones",
+            "the sounds of telephones",
+            "a telephone dialing occurs repeatedly",
+            "phone being hung up several times",
+            "telephone sounds",
+            "dialing phone",
+            "telephones dial",
+            "telephone noises",
+            "a telephone busy signal sounds",
+            "The sound of a telephone dialing tone was steady and clear",
+            "A rhythmic ringing noise came from the telephone",
+            "The telephone emitted a loud, repetitive beep",
+            "A soft click followed by a tone indicated a call being placed",
+            "The sound of the telephone was electronic and distinct"
+        ]
+    },
+    {
+        "event": "telephone_bell_ringing",
+        "phrases": [
+            "telephone bell",
+            "telephone bell rings",
+            "phone bells",
+            "someone is ringing a bell",
+            "a telephone bell rings intermittently",
+            "telephone bells ring multiple times",
+            "a telephone bell rings",
+            "telephone bells",
+            "a phone is ringing with a mechanical bell",
+            "a phone rings with a real phone bell",
+            "phone bells ring",
+            "telephone bells ring",
+            "telephone bells are ringing repeatedly",
+            "telephone bell ringing sounds",
+            "a telephone bell ringing several times",
+            "a bell telephone rings",
+            "a telephone is ringing inside an office room",
+            "ringing phone",
+            "a telephone bell rings with mechanisms",
+            "telephone bell ringing",
+            "telephone bells ringing",
+            "electronic phone ring",
+            "a phone rings and goes to busy",
+            "telephone rings",
+            "a telephone and its bell ringing",
+            "the sound of a telephone bell ringing",
+            "a telephone rings loudly two times",
+            "The telephone bell rang with a sharp, metallic tone",
+            "A rhythmic ringing sound indicated an incoming call",
+            "The sound of the telephone bell was loud and clear",
+            "A persistent ringing noise came from the telephone",
+            "The telephone bell emitted a distinct, repetitive chime"
+        ]
+    },
+    {
+        "event": "telephone_dialing",
+        "phrases": [
+            "electronic touch tone telephone dialing",
+            "dialing phone",
+            "a series of telephone keys dialing",
+            "telephones dial and ring",
+            "telephone dials",
+            "dialing",
+            "phone number dialing",
+            "he dials an old-fashioned phone",
+            "telephone dialing mechanisms echoing",
+            "telephone dial tones",
+            "dialing on a phone using touch tone dialing",
+            "telephone dialing and ringing",
+            "old phone number dial system is being used",
+            "a telephone dials and rings",
+            "dialing a telephone and ringing",
+            "someone dials on a rotary telephone",
+            "a telephone dials and tones",
+            "a phone is picking up and dialing a number",
+            "a telephone dialing occurs repeatedly",
+            "a telephone is ringing, dialing, and being answered",
+            "a series of telephone dialing tones",
+            "phones dial",
+            "manual telephone ringing",
+            "a telephone dialing tone ringing",
+            "a phone rings and goes to busy",
+            "a woman dials a telephone",
+            "telephone dialing sounds",
+            "telephone dialing and clicking sounds",
+            "someone is dialing a number on an old telephone",
+            "dialing occurs on a telephone",
+            "A sequence of tones indicated telephone dialing",
+            "The sound of dialing was rhythmic and electronic",
+            "A high-pitched beep followed by a pause indicated a digit entry",
+            "The dialing sounds were sharp and distinct",
+            "A series of quick beeps signaled dialing activity"
+        ]
+    },
+    {
+        "event": "throat_clearing",
+        "phrases": [
+            "throat clearing",
+            "throat clearing sounds",
+            "a person throat clearing",
+            "throat clearing noise",
+            "a man is making throat clearing noises",
+            "a person clearing his throat",
+            "someone makes throat sounds",
+            "A sharp throat-clearing sound broke the silence",
+            "The sound of throat clearing was abrupt and deliberate",
+            "A soft clearing noise indicated an attempt to gain attention",
+            "The throat-clearing sound was low and guttural",
+            "A faint throat-clearing noise was heard in the background"
+        ]
+    },
+    {
+        "event": "thump",
+        "phrases": [
+            "thump",
+            "thud",
+            "whack",
+            "hitting",
+            "thumping occurs",
+            "a hitting sound",
+            "something thumps",
+            "hitting sounds",
+            "thud sound",
+            "whack sound",
+            "a hit sound",
+            "a thud sound",
+            "A heavy thump echoed through the room",
+            "The sound of a thump was dull and abrupt",
+            "A rhythmic thumping noise indicated repeated impact",
+            "The thump was deep and resonant",
+            "A faint thump could be heard from the distance"
+        ]
+    },
+    {
+        "event": "thunder",
+        "phrases": [
+            "thunder",
+            "thunderstorm",
+            "crashing thunder",
+            "thunders",
+            "thunder is striking during a storm",
+            "thunder storm",
+            "loud thunderstorm",
+            "loud thunder clap",
+            "loud thunderclap",
+            "loud thunder that cracks five times",
+            "rolling thunder",
+            "the thunder",
+            "a close thunder strike",
+            "dry thunder",
+            "single close strong thunder",
+            "loud thunder",
+            "thunder is being recorded and remastered",
+            "thunderstorms",
+            "loud roars of thunder",
+            "a loud thunder strike",
+            "thunderstorm rumbles outside",
+            "thunder crashes",
+            "thunderstorm rumbles",
+            "heavy thunder",
+            "light thunder",
+            "thunder loudly",
+            "thunder sound",
+            "thunder slamming",
+            "rumbling thunder",
+            "thunder with reverb",
+            "A loud crack of thunder echoed sharply across the sky",
+            "The rumbling sound of thunder grew louder as the storm approached",
+            "A distant roll of thunder was faint but persistent",
+            "The thunderclap was sudden and startling",
+            "The low, resonant thunder shook the atmosphere"
+        ]
+    },
+    {
+        "event": "thunderstorm",
+        "phrases": [
+            "thunderstorm",
+            "thunder storm",
+            "thunderstorm sound",
+            "loud thunderstorm",
+            "thunderstorms",
+            "thunder is striking during a storm",
+            "crashing thunderstorms",
+            "thunderstorm rumbles outside",
+            "loud thunderstorm",
+            "thunderstorm sounds",
+            "thunderstorms roar",
+            "thunderstorm sounds with lightning",
+            "thundering with the rain coming down in sheets",
+            "a thunderstorm is looping",
+            "thunderstorm rumbles",
+            "thunderstorms rage",
+            "thunderstorms rumble",
+            "The sound of thunder rumbled continuously during the storm",
+            "A mix of heavy rain and thunder created a dramatic atmosphere",
+            "The thunderstorm produced sharp cracks followed by deep rumbles",
+            "The noise of the thunderstorm was intense and unrelenting",
+            "The soundscape was filled with overlapping thunderclaps and rain"
+        ]
+    },
+    {
+        "event": "thunk",
+        "phrases": [
+            "thunk sound",
+            "thud",
+            "thump",
+            "thud",
+            "whack",
+            "hitting",
+            "thumping occurs",
+            "a hitting sound",
+            "something thumps",
+            "hitting sounds",
+            "thud sound",
+            "whack sound",
+            "a hit sound",
+            "a thud sound",
+            "thunking",
+            "a thunk sound",
+            "A dull thunk echoed as the object hit the surface",
+            "The thunk was low-pitched and hollow",
+            "A faint thunk indicated something falling nearby",
+            "The sound of the thunk was abrupt and dampened",
+            "A repetitive thunking noise came from the distance"
+        ]
+    },
+    {
+        "event": "tick",
+        "phrases": [
+            "tick",
+            "a clock ticking",
+            "a clock tick-tocks",
+            "ticking",
+            "ticking sound",
+            "ticking from a clock",
+            "a ticking clock",
+            "a clock ticks",
+            "tick-tock sound",
+            "a tick",
+            "A sharp tick echoed in the quiet room",
+            "The ticking sound was steady and rhythmic",
+            "A faint metallic tick could be heard in the background",
+            "The sound of ticking was sharp and precise",
+            "A series of rapid ticks indicated a fast-moving mechanism"
+        ]
+    },
+    {
+        "event": "tick-tock",
+        "phrases": [
+            "tick-tock",
+            "tick-tocking",
+            "tick-tocks",
+            "tick-tock sounds",
+            "a tick-tock",
+            "tick-tock noise",
+            "a tick-tock occurs",
+            "tick-tocking consistently",
+            "a loud tick-tock",
+            "tick-tock sounds intermittently",
+            "a tick-tocking",
+            "tick-tocking sounds",
+            "a tick-tock rhythm",
+            "tick-tock of the pendulum",
+            "tick tocking",
+            "tick-tock of a clock",
+            "a series of medium tick-tocks",
+            "a tick-tock repeats rhythmically",
+            "tick-tock goes a clock",
+            "a tick-tock of a clock",
+            "tick-tocking by a clock",
+            "tick-tocking of a clock",
+            "a tick tock",
+            "loud, slow tick-tocking",
+            "tick tock of a clock",
+            "a tick tock of a clock",
+            "a low, soft tick-tock",
+            "tick-tock of a single clock",
+            "rhythmic tick-tocking",
+            "a tick-tock sound",
+            "The repetitive tick-tock of a clock was soothing",
+            "A steady tick-tock noise filled the air",
+            "The sound of tick-tock was rhythmic and calming",
+            "The clock's tick-tock echoed faintly in the room",
+            "A sharp tick followed by a soft tock created the classic clock sound"
+        ]
+    },
+    {
+        "event": "tire_squeal",
+        "phrases": [
+            "tire-squealing",
+            "tires skid and squeal loudly",
+            "tire squeal",
+            "a car skids very loudly",
+            "tire skids",
+            "vehicle skidding",
+            "tire skidding",
+            "vehicle tires squeal loudly",
+            "tires squeal while skidding",
+            "tires screech around a turn",
+            "cars skids",
+            "tires squeal",
+            "tires squeal the entire time",
+            "race car skidding",
+            "vehicle skid very loudly nearby",
+            "tire squealing",
+            "a car continuously skids",
+            "a car accelerates skidding",
+            "tires skid",
+            "tires skid and squeal",
+            "a car skids and honks",
+            "vehicles squeal tires",
+            "tires skidding",
+            "a vehicle accelerates and skids",
+            "an vehicle accelerates and skids",
+            "vehicle squealing tires",
+            "tires squeal and skid",
+            "cars are skidding with tire squeal",
+            "vehicle tires skidding",
+            "tires screech and squeal",
+            "A sharp tire squeal pierced the air as the vehicle braked",
+            "The high-pitched squealing noise was sudden and intense",
+            "A prolonged tire squeal indicated rapid acceleration",
+            "The sound of squealing tires echoed across the road",
+            "A faint tire squeal could be heard in the distance"
+        ]
+    },
+    {
+        "event": "toilet_flush",
+        "phrases": [
+            "a toilet is being flushed in a state room",
+            "water flows down a flushed toilet",
+            "water flows down a toilet",
+            "water rushes down the toilet",
+            "a woman flushes a toilet",
+            "toilets flush",
+            "water runs down a flushed toilet",
+            "a toilet is flushed for a few seconds",
+            "a toilet flushing in the background",
+            "toilet flushes abruptly",
+            "bathroom WC water",
+            "water flowing down a flushed toilet",
+            "an industrial toilet is flushed and drains",
+            "a toilet flushes in the background",
+            "a toilet flushes and water runs",
+            "a commercial toilet flushes extra fast",
+            "toilet is being flushed by water",
+            "water running down a flushed toilet",
+            "a toilet flushes lengthily",
+            "a toilet flushes and drains speedily",
+            "a toilet flushes and runs",
+            "flushing the toilet",
+            "water running from a flushed toilet",
+            "a toilet flushes loudly",
+            "tank valve is off and flush",
+            "a toilet flush in the background",
+            "a toilet flushes quickly",
+            "a toilet is being flushed in a small bathroom",
+            "toilet flushing and water running",
+            "The sound of a toilet flushing was abrupt and gushing",
+            "A loud whooshing noise accompanied the toilet flush",
+            "The toilet flushing sound was steady and brief",
+            "A gurgling noise followed the toilet flush, indicating drainage",
+            "The sound of the toilet flushing faded quickly"
+        ]
+    },
+    {
+        "event": "traffic_noise",
+        "phrases": [
+            "traffic noise",
+            "traffic sounds",
+            "traffic sounds are present and ongoing",
+            "traffic nearby",
+            "traffic is making noise",
+            "ambient highway noise",
+            "the sound of traffic",
+            "traffic noise fills the roadways",
+            "traffic ambiance in the background",
+            "city traffic sounds",
+            "the noise of traffic",
+            "traffic is ongoing",
+            "road noise occurs",
+            "traffic noises",
+            "traffic sounds on road",
+            "traffic flows",
+            "traffic sounds are looped",
+            "traffic noise in an urban setting",
+            "sound near a highway",
+            "sounds of traffic",
+            "traffic background",
+            "some traffic noise",
+            "traffic makes noises in the distance",
+            "traffic noise on the road",
+            "traffic fills the streets",
+            "traffic is near by",
+            "traffic in background",
+            "traffic sounds in the background",
+            "traffic noise in the street",
+            "traffic is present",
+            "The constant hum of traffic noise filled the city",
+            "A mix of honks and engine sounds created a bustling atmosphere",
+            "The sound of traffic noise was loud and unrelenting",
+            "A faint background noise of vehicles could be heard",
+            "The traffic noise grew louder during rush hour"
+        ]
+    },
+    {
+        "event": "train",
+        "phrases": [
+            "train",
+            "train sound",
+            "train passing by",
+            "train running",
+            "a train runs slowly on railroad tracks",
+            "a train moving",
+            "a train goes by",
+            "a train approaches",
+            "rustling of a train passing",
+            "a train running on railroad tracks",
+            "a train speeds by",
+            "brass suspension scary theme",
+            "death orb sounds",
+            "sound cue",
+            "camera interaction",
+            "chord progression",
+            "fire alarm sound",
+            "original beat",
+            "downshifting",
+            "a corporate rise-and-hit logo sound",
+            "pedal point",
+            "action sound",
+            "extended tail version",
+            "warbling suspenseful sound",
+            "putting the lid up",
+            "ball sound",
+            "wind down sound",
+            "catchy ad jingle",
+            "love gate sound",
+            "slide sound",
+            "The rumble of the train grew louder as it approached",
+            "A distinctive clattering noise came from the train wheels on the tracks",
+            "The sound of the train was rhythmic and mechanical",
+            "A faint train whistle accompanied the rumbling noise",
+            "The noise of the train echoed across the open landscape"
+        ]
+    },
+    {
+        "event": "train_horn",
+        "phrases": [
+            "a train whistle repeats multiple times",
+            "a train sounds the horn at a regular pace",
+            "a train's horn runs",
+            "train horn",
+            "a train horn sounds and echoes",
+            "a train blowing a horn twice",
+            "a training horn emits two lingering sounds",
+            "a train horn blows in the distance before becoming louder",
+            "a train whistle blares multiple times",
+            "a train horn sound",
+            "short train horn blast",
+            "train horn sounding",
+            "train horn audio recording",
+            "commuter train sound",
+            "a train car whistle",
+            "a train horn blares twice",
+            "a train horn sounds and approaches quickly",
+            "a train horn blares multiple times",
+            "a train horn sounds quickly",
+            "a train blows its horn twice",
+            "a train blowing its horn twice",
+            "a loud but brief train horn blares",
+            "two train horns sound",
+            "a train whistle blows three times",
+            "a train blowing its horn once",
+            "a train horn is triggered",
+            "a train horn sounds loudly and long",
+            "a train warning horn",
+            "a train horn sounds",
+            "a train horn blows three times",
+            "The train horn blared loudly, signaling its approach",
+            "A deep, resonant horn sound echoed through the area",
+            "The train horn was sharp and attention-grabbing",
+            "A prolonged horn blast indicated the train's presence",
+            "The sound of the train horn grew fainter as it moved away"
+        ]
+    },
+    {
+        "event": "train_whistle",
+        "phrases": [
+            "a train whistle repeats multiple times",
+            "a train sounds the horn at a regular pace",
+            "a train's horn runs",
+            "train horn",
+            "a train horn sounds and echoes",
+            "a train blowing a horn twice",
+            "a training horn emits two lingering sounds",
+            "a train horn blows in the distance before becoming louder",
+            "a train whistle blares multiple times",
+            "a train horn sound",
+            "short train horn blast",
+            "train horn sounding",
+            "train horn audio recording",
+            "commuter train sound",
+            "a train car whistle",
+            "a train horn blares twice",
+            "a train horn sounds and approaches quickly",
+            "a train horn blares multiple times",
+            "a train horn sounds quickly",
+            "a train blows its horn twice",
+            "a train blowing its horn twice",
+            "a loud but brief train horn blares",
+            "two train horns sound",
+            "a train whistle blows three times",
+            "a train blowing its horn once",
+            "a train horn is triggered",
+            "a train horn sounds loudly and long",
+            "a train warning horn",
+            "a train horn sounds",
+            "a train horn blows three times",
+            "The train whistle emitted a clear, high-pitched tone",
+            "A sharp whistle noise signaled the train's arrival",
+            "The sound of the train whistle was distinct and melodic",
+            "A long, echoing whistle was heard in the distance",
+            "The train whistle grew louder as the locomotive neared"
+        ]
+    },
+    {
+        "event": "trickle",
+        "phrases": [
+            "trickle",
+            "dribble",
+            "water dribbles nearby",
+            "water dribbles",
+            "water trickles",
+            "a stream trickles somewhere very close by",
+            "water trickles as it flows down",
+            "water trickling",
+            "water trickles and splashes",
+            "water flows at a steady trickle",
+            "trickling",
+            "some liquid trickles",
+            "trickle sounds",
+            "a stream of water flows and trickles",
+            "water trickling continuously",
+            "water is trickling",
+            "trickles",
+            "The soft trickle of water was soothing to hear",
+            "A faint trickling noise came from the nearby stream",
+            "The sound of trickling water was rhythmic and calming",
+            "A gentle trickle was audible in the background",
+            "The trickling noise grew louder as the water flowed faster"
+        ]
+    },
+    {
+        "event": "truck",
+        "phrases": [
+            "truck",
+            "truck running",
+            "truck engine",
+            "a loud truck engine",
+            "a truck is reversing",
+            "truck engine slows",
+            "a truck engine is revving up",
+            "a truck moves at constant pace",
+            "truck moving",
+            "a loud truck engine riving up",
+            "a truck accelerates repeatedly",
+            "a truck engine is accelerated",
+            "a loud truck engine riving up again",
+            "a big truck is departing",
+            "a truck drives",
+            "a truck engine goes by and slows",
+            "truck sounds",
+            "a truck is reversing and accelerating",
+            "a truck is making vroom sounds",
+            "a truck is accelerating and revving",
+            "a truck engine accelerating",
+            "a truck acceleration",
+            "a big truck sound",
+            "a truck engine is working at regular speed",
+            "a dump truck is reversing",
+            "a truck travels",
+            "a truck is stopping and accelerating",
+            "a truck goes by",
+            "a truck or something",
+            "a truck is running outdoor",
+            "The deep rumble of a truck's engine echoed nearby",
+            "A loud, mechanical noise came from the truck as it passed",
+            "The truck's engine produced a steady droning sound",
+            "A rhythmic clattering noise was heard as the truck moved over uneven ground",
+            "The sound of the truck faded as it drove away"
+        ]
+    },
+    {
+        "event": "turkey",
+        "phrases": [
+            "turkeys",
+            "turkey calls",
+            "turkey sounds",
+            "turkey gobbling",
+            "the sounds of a turkey",
+            "turkey vocalizations",
+            "turkeys vocalize",
+            "the sounds of turkeys",
+            "a turkey",
+            "turkeys speaking",
+            "gobbles of turkeys",
+            "turkeys gobbling",
+            "calls of turkeys",
+            "turkeys are making calls continuously",
+            "a turkey calls",
+            "a variety of turkey sounds",
+            "turkeys gobble",
+            "turkeys gobble loudly",
+            "a group of turkeys making sounds",
+            "turkeys make calls",
+            "a turkey gobbles",
+            "gobbling turkeys",
+            "chicken clucks",
+            "a turkey gobbling",
+            "male turkeys fighting",
+            "A loud gobble sound was emitted by the turkey",
+            "The turkey's gobbling noise was sharp and repetitive",
+            "A rhythmic gobble echoed in the distance",
+            "The sound of the turkey was distinct and attention-grabbing",
+            "A faint turkey gobble was heard in the background"
+        ]
+    },
+    {
+        "event": "typewriter",
+        "phrases": [
+            "typewriter",
+            "the sounds of a typewriter",
+            "typewriter mechanisms",
+            "a typewriter punctuates brief mechanisms",
+            "a typewriter",
+            "typewriter keys clack repeatedly",
+            "a typewriter makes sounds with scrapes and pings",
+            "typewriter sound effects",
+            "typewriter clicking to mechanisms",
+            "a typewriter types",
+            "typewriter clicks",
+            "typewriters",
+            "typing on a typewriter with clicking and clacking",
+            "typewriter noises",
+            "a manual typewriter is being used with single and multiple line feeds",
+            "a portable typewriter is typing with automatic spacing",
+            "an old fashioned typewriter being typed on quickly",
+            "typewriter typing",
+            "typewriter keys clack",
+            "typing on an old-fashioned typewriter",
+            "typewriter roller and ring",
+            "a typewriter functioning",
+            "a typewriter clicks and clacks",
+            "typing sounds from a typewriter",
+            "someone is typing fast on an old-fashioned typewriter",
+            "a typewriter types with a ding",
+            "typing on a typewriter",
+            "a typewriter operates",
+            "a typewriter in use",
+            "a typewriter clacks",
+            "The typewriter emitted a sharp clicking sound as keys were pressed",
+            "A rhythmic series of clicks and dings came from the typewriter",
+            "The sound of the typewriter was mechanical and precise",
+            "A faint clattering noise indicated the typewriter was in use",
+            "The typewriter's noise was sharp and consistent"
+        ]
+    },
+    {
+        "event": "typing",
+        "phrases": [
+            "typing",
+            "keyboard typing sounds",
+            "keyboarding sounds",
+            "typing very fast",
+            "typing sounds on a computer keyboard",
+            "computer keyboard sounds",
+            "typing sounds on computer keyboard",
+            "rapid keyboard typing",
+            "typing noises",
+            "typing on a keyboard is ongoing in the foreground",
+            "typing noise",
+            "short typing pattern on keyboard being repeated a few times",
+            "rapid typing",
+            "computer keyboard clicking sounds",
+            "typing sounds from a computer keyboard",
+            "typing sound of a computer keyboard",
+            "typing produces clicks on a keyboard",
+            "computer keyboard typing sounds",
+            "computer keyboard mechanisms",
+            "rapid typing on keyboard",
+            "typing sounds",
+            "keyboard sounds",
+            "clicking computer keyboard sounds",
+            "typing continuously",
+            "typing sounds on typewriter",
+            "intermittent typing on computer keyboard",
+            "rapid typing on a keyboard",
+            "a sequence of quick typing keystroke clanking",
+            "someone is entering a password using a keyboard",
+            "rapid typing of keyboard",
+            "The sound of typing was quick and rhythmic",
+            "A sharp clicking noise came from the keyboard",
+            "The typing sound grew louder as the pace increased",
+            "A faint tapping noise indicated someone typing nearby",
+            "The typing noise was steady and mechanical"
+        ]
+    },
+    {
+        "event": "vacuum_cleaner",
+        "phrases": [
+            "vacuum cleaner",
+            "vacuum",
+            "vacuum cleaner is being turned on and off",
+            "wood sander",
+            "a vacuum cleaner is starting and cutting off",
+            "vacuum cleaner is moving back and forth",
+            "vacuum cleaner being turned on and off",
+            "blending",
+            "a cylinder-type vacuum cleaner is stopping",
+            "someone is turning off a vacuum cleaner",
+            "a vacuum cleaner is switching off",
+            "a vacuum cleaner operates while making contact with a surface",
+            "a vacuum cleaner is turning on and off",
+            "a vacuum cleaner is being used on various surfaces",
+            "a vacuum cleaner is in use and stops",
+            "a vacuum cleaner is stopping on a carpet",
+            "hoover is being turned on/off",
+            "spraying the foam",
+            "a vacuum cleaner is running and making surface contact",
+            "air dryer sounds",
+            "a hand dryer in a bathroom is playing",
+            "someone is cleaning the house with an aspirator",
+            "vacuum running",
+            "someone is using a vacuum in their house",
+            "blender sounds",
+            "air freshener sound",
+            "a vacuum cleaner runs",
+            "street sweeper noise collector",
+            "a vacuum cleaner operates",
+            "air-compressor sounds",
+            "The vacuum cleaner emitted a loud, steady hum",
+            "A rhythmic suction noise came from the vacuum cleaner",
+            "The sound of the vacuum cleaner was mechanical and droning",
+            "A faint whirring noise indicated the vacuum cleaner was in use",
+            "The vacuum cleaner's noise grew softer as it moved to another room"
+        ]
+    },
+    {
+        "event": "vehicle_horn",
+        "phrases": [
+            "a car toots short",
+            "a horn honks in different tones repeatedly",
+            "vehicle horns honking several times",
+            "a series of a vehicle horn sounding",
+            "a car horn is honked several times in a row",
+            "a small vehicle horn toots once",
+            "a vehicle horn honking many times",
+            "a car horn goes off four times in two sets",
+            "a brief, loud car horn",
+            "a couple of car horns honking one after the other",
+            "a small car horn toots a few times",
+            "vehicle horns are triggered several times",
+            "a vehicle horn honks repeatedly and loudly",
+            "a car horn is honked several times",
+            "person is repeatedly hitting their car horn over and over and over",
+            "custom car horn",
+            "honking car horn sounds",
+            "an antique car horn honks repeatedly",
+            "vehicle car horn alarm blasts repeatedly",
+            "vehicle honking horn several times",
+            "a small car horn blows three times",
+            "a vehicle horn beeps loudly several times",
+            "a car horn honks melodically in different tones",
+            "a vehicle horn honks in alternating tones",
+            "a vehicle honking its horn several times",
+            "vehicle honking its horn",
+            "a horn is sounded on a moped",
+            "a vehicle honking at irregular intervals",
+            "A loud honk came from the vehicle horn",
+            "The horn's noise was sharp and attention-grabbing",
+            "A rhythmic honking sound indicated urgency",
+            "The vehicle horn blared continuously in the traffic",
+            "A faint honk was heard in the distance"
+        ]
+    },
+    {
+        "event": "walk",
+        "phrases": [
+            "walk",
+            "mid run sound",
+            "stepping",
+            "footsteps",
+            "walking",
+            "walk sound",
+            "footsteps walk",
+            "a man walks",
+            "walking occurs",
+            "a person walks along",
+            "light footsteps",
+            "footsteps occur briefly",
+            "sound of footsteps",
+            "sound of walking",
+            "footsteps take place",
+            "The sound of footsteps was steady and rhythmic",
+            "A faint tapping noise came from shoes hitting the ground",
+            "The walking noise grew louder as the person approached",
+            "A soft shuffling sound indicated slow walking",
+            "The rhythmic clatter of footsteps echoed in the hallway"
+        ]
+    },
+    {
+        "event": "water",
+        "phrases": [
+            "water",
+            "water effects",
+            "water sound effect",
+            "water from the fountain",
+            "water spring",
+            "gentle water effects",
+            "a water sound effect",
+            "a fountain of water flows",
+            "water trickles down into more water",
+            "the sounds of water",
+            "water flows in a bowl",
+            "water sounds",
+            "liquid",
+            "the water",
+            "water pours out onto a surface",
+            "a pouring water source streams",
+            "water falling onto itself",
+            "water filling a container",
+            "water is running into a mug",
+            "water makes contact with a surface",
+            "water fountain field recording",
+            "water fills a tap",
+            "filling water",
+            "water is splashing down into a basin",
+            "water runs onto itself",
+            "water runs once more",
+            "a water tap flows and splashes",
+            "some liquid flow is released several times",
+            "spilling water",
+            "fountain",
+            "The sound of water splashing was soft and rhythmic",
+            "A steady trickling noise indicated flowing water",
+            "The sound of water dripping was sharp and distinct",
+            "A bubbling noise came from the water in motion",
+            "The gentle sound of water created a soothing ambiance"
+        ]
+    },
+    {
+        "event": "water_tap",
+        "phrases": [
+            "water faucet",
+            "water tap sound",
+            "flush sound",
+            "a water tap runs briefly",
+            "a water tap runs and splashes",
+            "a water faucet is running",
+            "water runs from a faucet",
+            "a water faucet pouring",
+            "faucet runs",
+            "faucet water dripping",
+            "faucet water pouring",
+            "water trickling from a faucet",
+            "The water tap emitted a steady gushing sound as it was opened",
+            "A rhythmic dripping noise came from the tap left slightly open",
+            "The sound of the water tap turning off was abrupt and final",
+            "A faint hissing noise came from the water tap under pressure",
+            "The sound of water flowing from the tap was clear and steady"
+        ]
+    },
+    {
+        "event": "waterfall",
+        "phrases": [
+            "waterfall",
+            "waterfalls",
+            "a stream of water falls",
+            "a big waterfall",
+            "a large waterfall",
+            "a waterfall cascades",
+            "a huge waterfall",
+            "water falls",
+            "water cascades down a waterfall",
+            "a waterfall",
+            "the rushing of a waterfall",
+            "a large stream of water",
+            "water flows in a waterfall",
+            "water cascades down in a waterfall",
+            "loud rushing water from a river",
+            "a stream and waterfall",
+            "a large flow of liquid",
+            "a waterfall ambience",
+            "a loud roar of a waterfall",
+            "a stream of water rushing rapidly",
+            "a waterfall in the rural area",
+            "stream rushes loudly",
+            "a heavy stream of water",
+            "a waterfall is pouring into a stream",
+            "a small waterfall",
+            "a strong and powerful flowing waterfall",
+            "continuous roaring of a waterfall",
+            "The sound of the waterfall was loud and continuous",
+            "A deep roaring noise came from the cascading water",
+            "The waterfall's sound was rhythmic and powerful",
+            "A soft misty spray accompanied the sound of the waterfall",
+            "The distant roar of a waterfall was faint but distinct"
+        ]
+    },
+    {
+        "event": "waves",
+        "phrases": [
+            "waves",
+            "waves (surf)",
+            "surf waves",
+            "seawash",
+            "ocean waves",
+            "close ocean waves",
+            "waves break",
+            "waves break continues in succession",
+            "ocean",
+            "surf comes ashore",
+            "ocean waves are repeatedly splashing on shore",
+            "waves are moving and crashing in the background",
+            "waves roll slowly",
+            "ocean waves break",
+            "waves move aside",
+            "ocean waves ebb and flow",
+            "waves splash several times",
+            "large waves hit against a beach continuously",
+            "waves are continuously washing onto shore",
+            "ocean waves are breaking and crashing onto shore",
+            "the waves rush off",
+            "ocean waves are moving at a moderate pace",
+            "ocean currents",
+            "waves crashing onto shore continuously",
+            "surf",
+            "waves continuously crashing to shore",
+            "the ocean",
+            "the ocean waves are hitting the shore at a moderate pace",
+            "ocean waves are crashing and splashing onto shore",
+            "ocean waves repeatedly crash",
+            "The gentle lapping of waves was soothing to hear",
+            "A rhythmic crashing noise came as waves hit the shore",
+            "The sound of waves rolling was deep and continuous",
+            "A faint splashing noise indicated distant waves",
+            "The sound of waves breaking was sharp and distinct"
+        ]
+    },
+    {
+        "event": "whip",
+        "phrases": [
+            "whip",
+            "whips",
+            "a whip",
+            "whip cracks",
+            "whips smack fiberglass",
+            "whip sounds",
+            "the sounds of whipping",
+            "whip whooshing",
+            "whips crack",
+            "whips and whacks",
+            "whipping",
+            "the sound of a whip",
+            "a whip crack",
+            "a sudden whip",
+            "a whipping rush",
+            "whips crack in the wind",
+            "whip cracking",
+            "the sound of a whip cracking",
+            "whip noises",
+            "whips cracking in the wind",
+            "A sharp cracking sound came from the whip",
+            "The whip's motion produced a loud, snapping noise",
+            "A faint whistling noise preceded the whip's crack",
+            "The sound of the whip was sudden and startling",
+            "A rhythmic whipping noise indicated repeated motion"
+        ]
+    },
+    {
+        "event": "whispering",
+        "phrases": [
+            "whispering",
+            "whispering noise",
+            "whisling",
+            "whispering sounds",
+            "whispering noises",
+            "whispering with reverb",
+            "whispering in a small room",
+            "whispered ",
+            "whispering human voices",
+            "friendly ghost is whispering with variation",
+            "someone is whispering \"dare to be you\"",
+            "someone is whispering secrets to someone else",
+            "whispering in a large room",
+            "a whispering ghost sound",
+            "whispering sounds in the background",
+            "fantasy whispers are being whispered",
+            "whispered speech",
+            "human whispering",
+            "a person whispers",
+            "a young person whispers",
+            "someone is whispering",
+            "a female whispering",
+            "a person whispers in a small room",
+            "whispered words",
+            "someone is whispering",
+            "The sound of whispering was soft and indistinct",
+            "A faint whisper could be heard in the quiet room",
+            "The whispering noise was rhythmic and calming",
+            "A soft murmur of whispering filled the background",
+            "The whisper grew louder as the speaker leaned closer"
+        ]
+    },
+    {
+        "event": "whistle",
+        "phrases": [
+            "whistle",
+            "the whistle",
+            "a whistle",
+            "a louder whistle",
+            "a whistle sample",
+            "a whistle sound",
+            "multiple whistle sounds",
+            "a three note whistle",
+            "a sharp whistle",
+            "a mouth whistle",
+            "a whistle sounds",
+            "a whistle chirp",
+            "a series of whistling",
+            "a short whistle",
+            "the whistle gets louder",
+            "boys whistle a specific pattern",
+            "a human whistle",
+            "another whistle",
+            "a small rising whistle",
+            "melodical whistling",
+            "one of them whistles loudly",
+            "two quick whistle sounds",
+            "a young person whistles loudly and continuously",
+            "a series of sharp whistling",
+            "people whistle",
+            "several loud whistles",
+            "whistles",
+            "a loud whistle",
+            "whistle noise",
+            "A sharp whistle pierced the air",
+            "The whistle produced a high-pitched, clear tone",
+            "A rhythmic whistling noise came from the instrument",
+            "The sound of the whistle was attention-grabbing and distinct",
+            "A faint whistle could be heard in the distance"
+        ]
+    },
+    {
+        "event": "whistling",
+        "phrases": [
+            "whistling",
+            "whistling begins",
+            "whistling nearby",
+            "whistling consistently",
+            "whistling sound",
+            "whistling occurs",
+            "whistling sounds",
+            "whistling sound being made",
+            "whistling is ongoing",
+            "whistling is heard",
+            "whistling noises",
+            "whistling repeating",
+            "whistling noise",
+            "whistles sound",
+            "whistles",
+            "whistling of a person",
+            "whistling from a small group",
+            "whistling from a person",
+            "whistling takes place",
+            "continuous whistling",
+            "whistling takes place repeatedly",
+            "a woman whistles",
+            "an upbeat whistling",
+            "whistling noise in background",
+            "a woman whistles a tune",
+            "flirty whistling",
+            "constant whistling",
+            "consistent musical whistling",
+            "whistling noises that occur a few times",
+            "human whistling",
+            "A high-pitched whistling noise was steady and melodic",
+            "The sound of whistling was rhythmic and cheerful",
+            "A faint whistling noise came from someone nearby",
+            "The whistling sound grew louder as it approached",
+            "The whistling noise was sharp and clear"
+        ]
+    },
+    {
+        "event": "whoosh",
+        "phrases": [
+            "whoosh",
+            "a whoosh",
+            "a stereo whoosh",
+            "whoosh-swoosh",
+            "swoosh",
+            "fast whoosh",
+            "a large whoosh",
+            "a rushing whoosh",
+            "a loud whoosh",
+            "a quick whoosh",
+            "a fast whoosh",
+            "a woosh",
+            "a swooshing",
+            "a swish",
+            "an analog whoosh effect",
+            "woosh",
+            "a loud, sweeping whoosh",
+            "a whooshing sweep",
+            "whooses",
+            "a whoosh goes by",
+            "whooshes",
+            "whooshing",
+            "objects whoosh",
+            "missile whoosh",
+            "a whoosh occurs",
+            "a whooshing",
+            "objects whoosh by",
+            "a whoosh",
+            "A loud whooshing noise came from something moving quickly through the air",
+            "The whoosh was sharp and sudden, fading quickly",
+            "A faint whooshing noise indicated distant motion",
+            "The sound of the whoosh was sibilant and brief",
+            "A steady whooshing noise accompanied the rapid movement"
+        ]
+    },
+    {
+        "event": "wind_chime",
+        "phrases": [
+            "wind chimes blowing",
+            "wind chimes softly ringing",
+            "chiming wind chimes",
+            "ringing of wind chimes",
+            "wind chimes are jingling",
+            "wind chimes are playing",
+            "a wind chime is sounding",
+            "a wind chime is clanging",
+            "wind chimes are being played",
+            "wind chimes are being run",
+            "wind chime sound",
+            "wind chimes are blowing in the wind",
+            "The wind chime produced a soft, melodic tinkling sound",
+            "A rhythmic chiming noise came as the wind blew",
+            "The sound of the wind chime was delicate and calming",
+            "A faint tinkling noise indicated distant wind chimes",
+            "The wind chime's sound was harmonious and gentle"
+        ]
+    },
+    {
+        "event": "yell",
+        "phrases": [
+            "yelling",
+            "the sounds of shouting",
+            "scream",
+            "an agony yelling",
+            "a loud yelling",
+            "screaming",
+            "another person yells",
+            "an adult male yells",
+            "old man yells",
+            "a man is screaming in panic and pain",
+            "man screaming",
+            "wild screaming",
+            "screams",
+            "a loud screaming",
+            "a series of shouts from a woman",
+            "a loud male voice screams",
+            "yells",
+            "people are shouting \"hip hip hooray\"",
+            "females shout",
+            "A loud yell pierced the air, commanding attention",
+            "The sound of the yell was sharp and sudden",
+            "A faint yell could be heard in the distance",
+            "The yell was abrupt and startling",
+            "A repetitive yell indicated urgency or excitement"
+        ]
+    },
+    {
+        "event": "yip",
+        "phrases": [
+            "a yipping sound",
+            "yipping",
+            "a dog yipping",
+            "a dog yips and pants",
+            "dogs are yipping",
+            "animal yipping",
+            "a dog is yipping loudly",
+            "dogs yip",
+            "a dog barks and yips",
+            "A sharp yip broke the silence",
+            "The sound of a yip was high-pitched and brief",
+            "A rhythmic series of yips indicated excitement",
+            "The yip was faint but distinct in the background",
+            "A sudden yip was heard from a small dog nearby"
+        ]
+    }
+]
\ No newline at end of file
diff --git a/utils/tests/test_logging.py b/utils/tests/test_logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ce9d080a7dd63c013ea7382e160d8b4ecfa6adb
--- /dev/null
+++ b/utils/tests/test_logging.py
@@ -0,0 +1,19 @@
+import unittest
+from pathlib import Path
+
+from utils.logging import LoggingLogger
+
+
+class TestLoggingLogger(unittest.TestCase):
+    def setUp(self):
+        self.tmp_log_path = Path("./tmp_logging.txt")
+
+    def test_logging_info(self):
+        logger = LoggingLogger(filename=self.tmp_log_path,
+                               level="INFO").create_instance()
+        logger.info("logging information")
+        self.assertTrue(self.tmp_log_path.exists())
+
+    def tearDown(self):
+        if self.tmp_log_path.exists():
+            self.tmp_log_path.unlink()
diff --git a/utils/torch_utilities.py b/utils/torch_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c85938b256bf274abac79ae1b7be222820c108e
--- /dev/null
+++ b/utils/torch_utilities.py
@@ -0,0 +1,168 @@
+import logging
+from typing import Callable
+from pathlib import Path
+import torch
+import torch.nn as nn
+
+logger = logging.Logger(__file__)
+
+
+def remove_key_prefix_factory(prefix: str = "module."):
+    def func(
+        model_dict: dict[str, torch.Tensor], state_dict: dict[str,
+                                                              torch.Tensor]
+    ) -> dict[str, torch.Tensor]:
+
+        state_dict = {
+            key[len(prefix):]: value
+            for key, value in state_dict.items() if key.startswith(prefix)
+        }
+        return state_dict
+
+    return func
+
+
+def merge_matched_keys(
+    model_dict: dict[str, torch.Tensor], state_dict: dict[str, torch.Tensor]
+) -> dict[str, torch.Tensor]:
+    """
+    Args:
+    model_dict:
+        The state dict of the current model, which is going to load pretrained parameters
+    state_dict:
+        A dictionary of parameters from a pre-trained model.
+
+    Returns:
+        dict[str, torch.Tensor]:
+            The updated state dict, where parameters with matched keys and shape are 
+            updated with values in `state_dict`.
+    """
+    pretrained_dict = {}
+    mismatch_keys = []
+    for key, value in state_dict.items():
+        if key in model_dict and model_dict[key].shape == value.shape:
+            pretrained_dict[key] = value
+        else:
+            mismatch_keys.append(key)
+    logger.info(
+        f"Loading pre-trained model, with mismatched keys {mismatch_keys}"
+    )
+    model_dict.update(pretrained_dict)
+    return model_dict
+
+
+def load_pretrained_model(
+    model: nn.Module,
+    ckpt_or_state_dict: str | Path | dict[str, torch.Tensor],
+    state_dict_process_fn: Callable = merge_matched_keys
+) -> None:
+    state_dict = ckpt_or_state_dict
+    if not isinstance(state_dict, dict):
+        state_dict = torch.load(ckpt_or_state_dict, "cpu")
+
+    model_dict = model.state_dict()
+    state_dict = state_dict_process_fn(model_dict, state_dict)
+    model.load_state_dict(state_dict, strict=False, assign=True)
+
+
+def create_mask_from_length(
+    lengths: torch.Tensor, max_length: int | None = None
+):
+    if max_length is None:
+        max_length = max(lengths)
+    idxs = torch.arange(max_length).reshape(1, -1)  # (1, max_length)
+    mask = idxs.to(lengths.device) < lengths.view(-1, 1)
+    # (1, max_length) < (batch_size, 1) -> (batch_size, max_length)
+    return mask
+
+
+def loss_with_mask(
+    loss: torch.Tensor,
+    mask: torch.Tensor,
+    reduce: bool = True
+) -> torch.Tensor:
+    """
+    Apply a mask to the loss tensor and optionally reduce it.
+
+    Args:
+        loss: Tensor of shape (b, t, ...) representing the loss values.
+        mask: Tensor of shape (b, t) where 1 indicates valid positions and 0 indicates masked positions.
+        reduce: If True, return a single scalar value; otherwise, return a tensor of shape (b,).
+
+    Returns:
+        torch.Tensor: A scalar if reduce is True, otherwise a tensor of shape (b,).
+    """
+    expanded_mask = mask[(..., ) + (None, ) * (loss.ndim - mask.ndim)]
+    expanded_mask = expanded_mask.expand_as(loss)
+    masked_loss = loss * expanded_mask
+
+    sum_dims = tuple(range(1, loss.ndim))
+    loss_sum = masked_loss.sum(dim=sum_dims)
+    mask_sum = expanded_mask.sum(dim=sum_dims)
+    loss = loss_sum / mask_sum
+
+    if reduce:
+        return loss.mean()
+    else:
+        return loss
+
+
+def convert_pad_shape(pad_shape: list[list[int]]):
+    l = pad_shape[::-1]
+    pad_shape = [item for sublist in l for item in sublist]
+    return pad_shape
+
+
+def create_alignment_path(duration: torch.Tensor, mask: torch.Tensor):
+    device = duration.device
+
+    b, t_x, t_y = mask.shape
+    cum_duration = torch.cumsum(duration, 1)
+    print(mask.shape)
+    print(duration.shape)
+    print(cum_duration.shape)
+    cum_duration_flat = cum_duration.view(b * t_x)
+    path = create_mask_from_length(cum_duration_flat, t_y).to(mask.dtype)
+    path = path.view(b, t_x, t_y)
+    # take the diff on the `t_x` axis
+    path = path - torch.nn.functional.pad(
+        path, convert_pad_shape([[0, 0], [1, 0], [0, 0]])
+    )[:, :-1]
+    path = path * mask
+    return path
+
+
+def trim_or_pad_length(x: torch.Tensor, target_length: int, length_dim: int):
+    """
+    Adjusts the size of the specified dimension of tensor x to match `target_length`.
+    
+    Args:
+        x:
+            Input tensor.
+        target_length: 
+            Desired size of the specified dimension.
+        length_dim: 
+            The dimension to modify.
+    
+    Returns:
+        torch.Tensor: The adjusted tensor.
+    """
+    current_length = x.shape[length_dim]
+
+    if current_length > target_length:
+        # Truncate the tensor
+        slices = [slice(None)] * x.ndim
+        slices[length_dim] = slice(0, target_length)
+        return x[tuple(slices)]
+
+    elif current_length < target_length:
+        # Pad the tensor
+        pad_shape = list(x.shape)
+        pad_length = target_length - current_length
+
+        pad_shape[length_dim] = pad_length  # Shape for left padding
+        padding = torch.zeros(pad_shape, dtype=x.dtype, device=x.device)
+
+        return torch.cat([x, padding], dim=length_dim)
+
+    return x