# Sample rate (Hz) used when writing the generated waveform to disk. The
# content encoder in this repository documents 24000 as the audio sample rate.
_OUTPUT_SAMPLE_RATE = 24000
# Fallback values used when no usable timing information is available.
_DEFAULT_ONSET = "random"
_DEFAULT_LENGTH = "10.0"


def is_tdc_format_valid(tdc_str):
    """Return True if *tdc_str* is a well-formed TDC (timestamp) string.

    The expected layout is
    ``event1__start1-end1_start2-end2--event2__start1-end1``: events are
    separated by ``--``, an event name and its intervals by ``__``, intervals
    by ``_`` and the two bounds of an interval by ``-``. Both bounds must parse
    as floats because the content encoder converts them with ``float()``.

    Args:
        tdc_str: Candidate TDC string. Non-string values are rejected.

    Returns:
        bool: True when every event and interval matches the layout.
    """
    try:
        for event_onset in tdc_str.split("--"):
            _event, instance = event_onset.split("__")
            for start_end in instance.split("_"):
                start, end = start_end.split("-")
                float(start)
                float(end)
    except (AttributeError, ValueError):
        # AttributeError: not a string; ValueError: wrong number of fields or
        # a bound that is not a number.
        return False
    return True


def infer(input_text, input_onset, input_length, time_control):
    """Generate audio for a caption and write it to ``output.wav``.

    Args:
        input_text: Caption (TCC) describing the audio to generate.
        input_onset: TDC string; anything malformed is replaced by "random".
        input_length: Clip length in seconds; non-numeric or empty values are
            replaced by "10.0".
        time_control: When true and the onset or length is empty, the LLM
            helper ``get_time_info`` is asked to supply both.

    Returns:
        tuple[str, str]: Path of the written wav file and the onset string
        that was actually passed to the model.
    """
    # A malformed TDC string would fail later inside the content encoder, so
    # fall back to generation without temporal control.
    if input_onset and not is_tdc_format_valid(input_onset):
        input_onset = _DEFAULT_ONSET

    # NOTE(review): the UI pre-fills onset="random" and length="10.0", so the
    # LLM is only consulted when the user clears a field - confirm intended.
    if time_control and (not input_onset or not input_length):
        try:
            input_json = json.loads(get_time_info(input_text))
            input_onset, input_length = input_json["onset"], input_json["length"]
        except (KeyError, TypeError, ValueError):
            # The LLM reply was not the expected JSON object.
            input_onset = _DEFAULT_ONSET
        else:
            if input_onset != _DEFAULT_ONSET and not is_tdc_format_valid(input_onset):
                input_onset = _DEFAULT_ONSET

    input_onset = input_onset or _DEFAULT_ONSET
    try:
        float(input_length)
    except (TypeError, ValueError):
        input_length = _DEFAULT_LENGTH

    content = {
        "caption": input_text,
        "onset": input_onset,
        "length": input_length,
    }

    with torch.no_grad():
        waveform = model(content)

    output_wav = "output.wav"
    sf.write(
        output_wav,
        waveform[0, 0].cpu().numpy(),
        # The original read an undefined `exp_config` here (NameError).
        samplerate=_OUTPUT_SAMPLE_RATE,
    )
    return output_wav, str(input_onset)
Audio"), + gr.Textbox(label="Final TDC Used (input_onset)") + ], + title="PicoAudio2 Online Inference", + description=( + "TCC (caption) is neto generate audio. " + "If you need time control, please enter TDC and length (in seconds). " + "Alternatively, you can let the LLM generate TDC, but API quota limits may affect availability. " + "TDC format: \"event1__start1-end1_start2-end2--event2__start1-end1\", for example: " + "\"a_dog_barks__1.0-2.0_3.0-4.0--a_man_speaks__5.0-6.0\"." + "If the format of TDC is wrong or no input length, the model will generate audio without temporal control. Sorry!" + ) +) +if __name__ == "__main__": + demo.launch() \ No newline at end of file diff --git a/models/__pycache__/common.cpython-310.pyc b/models/__pycache__/common.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2945374926290bb58fd9d6af7dd55961592e3bf Binary files /dev/null and b/models/__pycache__/common.cpython-310.pyc differ diff --git a/models/__pycache__/content_adapter.cpython-310.pyc b/models/__pycache__/content_adapter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3de9f12f7f15396218a2177b2b598c30ab6cdc7 Binary files /dev/null and b/models/__pycache__/content_adapter.cpython-310.pyc differ diff --git a/models/__pycache__/diffusion.cpython-310.pyc b/models/__pycache__/diffusion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea2335bb198a2d9eb6c9f3adf8792caf2fe68dca Binary files /dev/null and b/models/__pycache__/diffusion.cpython-310.pyc differ diff --git a/models/__pycache__/diffusion_cfg.cpython-310.pyc b/models/__pycache__/diffusion_cfg.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cb083c6ebda48738fc5885dcc5b1d620a667632 Binary files /dev/null and b/models/__pycache__/diffusion_cfg.cpython-310.pyc differ diff --git a/models/__pycache__/diffusion_cfg_new.cpython-310.pyc 
b/models/__pycache__/diffusion_cfg_new.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8946abbbfe664f7553d7545d28fcfa0812b32057 Binary files /dev/null and b/models/__pycache__/diffusion_cfg_new.cpython-310.pyc differ diff --git a/models/__pycache__/diffusion_content_cfg.cpython-310.pyc b/models/__pycache__/diffusion_content_cfg.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cb3b0cca20cd3c06339da4b0a57617a086f4eab6 Binary files /dev/null and b/models/__pycache__/diffusion_content_cfg.cpython-310.pyc differ diff --git a/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc b/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4be9a8bcbcc162270f0cd469a80c82b148df1694 Binary files /dev/null and b/models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc differ diff --git a/models/autoencoder/autoencoder_base.py b/models/autoencoder/autoencoder_base.py new file mode 100644 index 0000000000000000000000000000000000000000..2852ad185b48e9595e116735baed689fa09cc0d3 --- /dev/null +++ b/models/autoencoder/autoencoder_base.py @@ -0,0 +1,22 @@ +from abc import abstractmethod, ABC +from typing import Sequence +import torch +import torch.nn as nn + + +class AutoEncoderBase(ABC): + def __init__( + self, downsampling_ratio: int, sample_rate: int, + latent_shape: Sequence[int | None] + ): + self.downsampling_ratio = downsampling_ratio + self.sample_rate = sample_rate + self.latent_token_rate = sample_rate // downsampling_ratio + self.latent_shape = latent_shape + self.time_dim = latent_shape.index(None) + 1 # the first dim is batch + + @abstractmethod + def encode( + self, waveform: torch.Tensor, waveform_lengths: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: + ... 
diff --git a/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc b/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2b167a38dcd8cbac4dd85da41050a0ec3cbea454 Binary files /dev/null and b/models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc differ diff --git a/models/autoencoder/waveform/stable_vae.py b/models/autoencoder/waveform/stable_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..f7373ec13aca61a57dd5849735a745cf88b39e68 --- /dev/null +++ b/models/autoencoder/waveform/stable_vae.py @@ -0,0 +1,537 @@ +from typing import Any, Literal, Callable +import math +from pathlib import Path + +import torch +import torch.nn as nn +from torch.nn.utils.parametrizations import weight_norm +import torchaudio +from alias_free_torch import Activation1d + +from models.common import LoadPretrainedBase +from models.autoencoder.autoencoder_base import AutoEncoderBase +from utils.torch_utilities import remove_key_prefix_factory, create_mask_from_length + + +# jit script make it 1.4x faster and save GPU memory +@torch.jit.script +def snake_beta(x, alpha, beta): + return x + (1.0 / (beta+0.000000001)) * pow(torch.sin(x * alpha), 2) + + +class SnakeBeta(nn.Module): + def __init__( + self, + in_features, + alpha=1.0, + alpha_trainable=True, + alpha_logscale=True + ): + super(SnakeBeta, self).__init__() + self.in_features = in_features + + # initialize alpha + self.alpha_logscale = alpha_logscale + if self.alpha_logscale: + # log scale alphas initialized to zeros + self.alpha = nn.Parameter(torch.zeros(in_features) * alpha) + self.beta = nn.Parameter(torch.zeros(in_features) * alpha) + else: + # linear scale alphas initialized to ones + self.alpha = nn.Parameter(torch.ones(in_features) * alpha) + self.beta = nn.Parameter(torch.ones(in_features) * alpha) + + self.alpha.requires_grad = alpha_trainable + self.beta.requires_grad = alpha_trainable + + # 
self.no_div_by_zero = 0.000000001 + + def forward(self, x): + alpha = self.alpha.unsqueeze(0).unsqueeze(-1) + # line up with x to [B, C, T] + beta = self.beta.unsqueeze(0).unsqueeze(-1) + if self.alpha_logscale: + alpha = torch.exp(alpha) + beta = torch.exp(beta) + x = snake_beta(x, alpha, beta) + + return x + + +def WNConv1d(*args, **kwargs): + return weight_norm(nn.Conv1d(*args, **kwargs)) + + +def WNConvTranspose1d(*args, **kwargs): + return weight_norm(nn.ConvTranspose1d(*args, **kwargs)) + + +def get_activation( + activation: Literal["elu", "snake", "none"], + antialias=False, + channels=None +) -> nn.Module: + if activation == "elu": + act = nn.ELU() + elif activation == "snake": + act = SnakeBeta(channels) + elif activation == "none": + act = nn.Identity() + else: + raise ValueError(f"Unknown activation {activation}") + + if antialias: + act = Activation1d(act) + + return act + + +class ResidualUnit(nn.Module): + def __init__( + self, + in_channels, + out_channels, + dilation, + use_snake=False, + antialias_activation=False + ): + super().__init__() + + self.dilation = dilation + + padding = (dilation * (7-1)) // 2 + + self.layers = nn.Sequential( + get_activation( + "snake" if use_snake else "elu", + antialias=antialias_activation, + channels=out_channels + ), + WNConv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=7, + dilation=dilation, + padding=padding + ), + get_activation( + "snake" if use_snake else "elu", + antialias=antialias_activation, + channels=out_channels + ), + WNConv1d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1 + ) + ) + + def forward(self, x): + res = x + + #x = checkpoint(self.layers, x) + x = self.layers(x) + + return x + res + + +class EncoderBlock(nn.Module): + def __init__( + self, + in_channels, + out_channels, + stride, + use_snake=False, + antialias_activation=False + ): + super().__init__() + + self.layers = nn.Sequential( + ResidualUnit( + in_channels=in_channels, + 
class EncoderBlock(nn.Module):
    """Three dilated residual units followed by a strided downsampling conv."""

    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False
    ):
        super().__init__()

        # Residual units with dilations 1, 3 and 9, all at `in_channels`.
        stages = [
            ResidualUnit(
                in_channels=in_channels,
                out_channels=in_channels,
                dilation=dilation,
                use_snake=use_snake
            )
            for dilation in (1, 3, 9)
        ]
        stages.append(
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=in_channels
            )
        )
        # The strided convolution changes the channel count and downsamples.
        stages.append(
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2)
            )
        )
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the block."""
        return self.layers(x)


class DecoderBlock(nn.Module):
    """Upsampling layer followed by three dilated residual units."""

    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False
    ):
        super().__init__()

        kernel_size = 2 * stride
        # The upsampler is built first, as in the layer order the original
        # used for parameter initialization.
        if not use_nearest_upsample:
            upsample_layer = WNConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                stride=stride,
                padding=math.ceil(stride / 2)
            )
        else:
            # Nearest-neighbour upsampling followed by a stride-1 convolution.
            upsample_layer = nn.Sequential(
                nn.Upsample(scale_factor=stride, mode="nearest"),
                WNConv1d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=kernel_size,
                    stride=1,
                    bias=False,
                    padding='same'
                )
            )

        pre_activation = get_activation(
            "snake" if use_snake else "elu",
            antialias=antialias_activation,
            channels=in_channels
        )
        residual_units = [
            ResidualUnit(
                in_channels=out_channels,
                out_channels=out_channels,
                dilation=dilation,
                use_snake=use_snake
            )
            for dilation in (1, 3, 9)
        ]
        self.layers = nn.Sequential(
            pre_activation, upsample_layer, *residual_units
        )

    def forward(self, x):
        """Run ``x`` through the block."""
        return self.layers(x)
class OobleckEncoder(nn.Module):
    """Convolutional encoder: input conv, strided encoder blocks, output conv.

    Args:
        in_channels: Channels of the input signal.
        channels: Base channel width; each stage uses ``c_mult * channels``.
        latent_dim: Channels produced by the final convolution.
        c_mults: Channel multipliers, one per encoder block.
        strides: Downsampling stride of each encoder block.
        use_snake: Use the snake activation instead of ELU.
        antialias_activation: Wrap the final activation in the anti-aliased
            wrapper. It is not forwarded to the encoder blocks.
    """

    def __init__(
        self,
        in_channels=2,
        channels=128,
        latent_dim=32,
        # Tuples instead of lists: defaults must not be shared mutable objects.
        c_mults=(1, 2, 4, 8),
        strides=(2, 4, 8, 8),
        use_snake=False,
        antialias_activation=False
    ):
        super().__init__()

        # Prepend the multiplier of the input convolution; works for any
        # iterable of multipliers (list, tuple, config list).
        c_mults = [1, *c_mults]

        self.depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=in_channels,
                out_channels=c_mults[0] * channels,
                kernel_size=7,
                padding=3
            )
        ]

        for i in range(self.depth - 1):
            layers.append(
                EncoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i + 1] * channels,
                    stride=strides[i],
                    use_snake=use_snake
                )
            )

        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[-1] * channels
            ),
            WNConv1d(
                in_channels=c_mults[-1] * channels,
                out_channels=latent_dim,
                kernel_size=3,
                padding=1
            )
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` by running it through all layers."""
        return self.layers(x)


class OobleckDecoder(nn.Module):
    """Convolutional decoder mirroring ``OobleckEncoder``.

    Args:
        out_channels: Channels of the produced signal.
        channels: Base channel width; each stage uses ``c_mult * channels``.
        latent_dim: Channels of the latent input.
        c_mults: Channel multipliers, traversed from last to first.
        strides: Upsampling stride of each decoder block, traversed from last
            to first.
        use_snake: Use the snake activation instead of ELU.
        antialias_activation: Forwarded to the decoder blocks and the final
            activation.
        use_nearest_upsample: Forwarded to the decoder blocks.
        final_tanh: Append ``nn.Tanh`` (otherwise ``nn.Identity``).
    """

    def __init__(
        self,
        out_channels=2,
        channels=128,
        latent_dim=32,
        # Tuples instead of lists: defaults must not be shared mutable objects.
        c_mults=(1, 2, 4, 8),
        strides=(2, 4, 8, 8),
        use_snake=False,
        antialias_activation=False,
        use_nearest_upsample=False,
        final_tanh=True
    ):
        super().__init__()

        c_mults = [1, *c_mults]

        self.depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=latent_dim,
                out_channels=c_mults[-1] * channels,
                kernel_size=7,
                padding=3
            ),
        ]

        # Walk the stages from the widest back to the narrowest.
        for i in range(self.depth - 1, 0, -1):
            layers.append(
                DecoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i - 1] * channels,
                    stride=strides[i - 1],
                    use_snake=use_snake,
                    antialias_activation=antialias_activation,
                    use_nearest_upsample=use_nearest_upsample
                )
            )

        layers += [
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=c_mults[0] * channels
            ),
            WNConv1d(
                in_channels=c_mults[0] * channels,
                out_channels=out_channels,
                kernel_size=7,
                padding=3,
                bias=False
            ),
            nn.Tanh() if final_tanh else nn.Identity()
        ]

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Decode ``x`` by running it through all layers."""
        return self.layers(x)
class Bottleneck(nn.Module):
    """Interface of the latent bottleneck placed after the encoder."""

    def __init__(self, is_discrete: bool = False):
        super().__init__()

        self.is_discrete = is_discrete

    def encode(self, x, return_info=False, **kwargs):
        raise NotImplementedError

    def decode(self, x):
        raise NotImplementedError


@torch.jit.script
def vae_sample(mean, scale) -> dict[str, torch.Tensor]:
    """Draw a reparameterized sample and compute the KL term.

    The standard deviation is ``softplus(scale) + 1e-4``. Returns a dict with
    ``"latents"`` (the sample) and ``"kl"`` (summed over dim 1, then averaged).
    """
    stdev = nn.functional.softplus(scale) + 1e-4
    var = stdev * stdev
    logvar = torch.log(var)
    latents = torch.randn_like(mean) * stdev + mean

    kl = (mean*mean + var - logvar - 1).sum(1).mean()
    return {"latents": latents, "kl": kl}


class VAEBottleneck(Bottleneck):
    """Continuous VAE bottleneck: samples latents from (mean, scale) halves."""

    def __init__(self):
        super().__init__(is_discrete=False)

    def encode(
        self,
        x,
        return_info=False,
        **kwargs
    ) -> tuple[torch.Tensor, dict[str, torch.Tensor]] | torch.Tensor:
        """Split ``x`` along dim 1 into mean and scale and sample latents.

        Returns:
            The sampled latents, or ``(latents, {"kl": kl})`` when
            ``return_info`` is true.
        """
        mean, scale = x.chunk(2, dim=1)
        sampled = vae_sample(mean, scale)

        if return_info:
            return sampled["latents"], {"kl": sampled["kl"]}
        return sampled["latents"]

    def decode(self, x):
        """Return ``x`` unchanged."""
        return x


def compute_mean_kernel(x, y):
    """Return the mean of ``exp(-d)`` over all pairs of rows of x and y.

    ``d`` is the squared difference averaged over dim 2 and then divided by
    the size of the last dimension of ``x``.
    """
    kernel_input = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
    return torch.exp(-kernel_input).mean()


class Pretransform(nn.Module):
    """Interface of an optional transform applied around the autoencoder."""

    def __init__(self, enable_grad, io_channels, is_discrete):
        super().__init__()

        self.is_discrete = is_discrete
        self.io_channels = io_channels
        # Left unset here; presumably filled in by subclasses - TODO confirm.
        self.encoded_channels = None
        self.downsampling_ratio = None

        self.enable_grad = enable_grad

    def encode(self, x):
        raise NotImplementedError

    def decode(self, z):
        raise NotImplementedError

    def tokenize(self, x):
        raise NotImplementedError

    def decode_tokens(self, tokens):
        raise NotImplementedError


class StableVAE(LoadPretrainedBase, AutoEncoderBase):
    """Waveform autoencoder built from an encoder, a bottleneck and a decoder.

    Args:
        encoder: Module mapping waveforms to pre-bottleneck features.
        decoder: Module mapping latents back to waveforms.
        latent_dim: Channel count of the latent.
        downsampling_ratio: Waveform samples per latent frame.
        sample_rate: Waveform sample rate.
        io_channels: Default for both input and output channel counts.
        bottleneck: Optional bottleneck applied after the encoder.
        pretransform: Stored but not used by ``encode``/``decode``.
        in_channels: Overrides ``io_channels`` for the input when given.
        out_channels: Overrides ``io_channels`` for the output when given.
        soft_clip: Stored but not used by ``encode``/``decode``.
        pretrained_ckpt: Checkpoint loaded at construction when given.
    """

    def __init__(
        self,
        encoder,
        decoder,
        latent_dim,
        downsampling_ratio,
        sample_rate,
        io_channels=2,
        bottleneck: Bottleneck = None,
        pretransform: Pretransform = None,
        in_channels=None,
        out_channels=None,
        soft_clip=False,
        pretrained_ckpt: str | Path = None
    ):
        LoadPretrainedBase.__init__(self)
        AutoEncoderBase.__init__(
            self,
            downsampling_ratio=downsampling_ratio,
            sample_rate=sample_rate,
            latent_shape=(latent_dim, None)
        )

        self.latent_dim = latent_dim
        self.io_channels = io_channels
        self.in_channels = io_channels
        self.out_channels = io_channels
        self.min_length = self.downsampling_ratio

        if in_channels is not None:
            self.in_channels = in_channels

        if out_channels is not None:
            self.out_channels = out_channels

        self.bottleneck = bottleneck
        self.encoder = encoder
        self.decoder = decoder
        self.pretransform = pretransform
        self.soft_clip = soft_clip
        self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete

        self.remove_autoencoder_prefix_fn: Callable = remove_key_prefix_factory(
            "autoencoder."
        )
        if pretrained_ckpt is not None:
            self.load_pretrained(pretrained_ckpt)

    def process_state_dict(self, model_dict, state_dict):
        """Unwrap the checkpoint's ``"state_dict"`` entry and strip the prefix."""
        state_dict = state_dict["state_dict"]
        state_dict = self.remove_autoencoder_prefix_fn(model_dict, state_dict)
        return state_dict

    def encode(
        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode waveforms into latents and a mask built from their lengths.

        Returns:
            ``(z, z_mask)`` where ``z_mask`` is created from
            ``waveform_lengths // downsampling_ratio``.
        """
        z = self.encoder(waveform)
        # `bottleneck` defaults to None; the original dereferenced it
        # unconditionally and crashed for bottleneck-free models.
        if self.bottleneck is not None:
            z = self.bottleneck.encode(z)
        z_length = waveform_lengths // self.downsampling_ratio
        z_mask = create_mask_from_length(z_length)
        return z, z_mask

    def decode(self, latents: torch.Tensor) -> torch.Tensor:
        """Decode latents into a waveform with the decoder."""
        waveform = self.decoder(latents)
        return waveform
torchaudio.functional.resample( + waveform, sr, model_config["sample_rate"] + ) + print("waveform: ", waveform.shape) + with torch.no_grad(): + latent, latent_length = autoencoder.encode( + waveform, torch.as_tensor([waveform.shape[-1]]) + ) + print("latent: ", latent.shape) + reconstructed = autoencoder.decode(latent) + print("reconstructed: ", reconstructed.shape) + import soundfile as sf + sf.write( + "./reconstructed.wav", + reconstructed[0, 0].numpy(), + samplerate=model_config["sample_rate"] + ) diff --git a/models/common.py b/models/common.py new file mode 100644 index 0000000000000000000000000000000000000000..a4832f8fb67dda9cb608f6874b2b4f17ac20442a --- /dev/null +++ b/models/common.py @@ -0,0 +1,69 @@ +from pathlib import Path +import torch +import torch.nn as nn +from utils.torch_utilities import load_pretrained_model, merge_matched_keys +import warnings + +class LoadPretrainedBase(nn.Module): + def process_state_dict( + self, model_dict: dict[str, torch.Tensor], + state_dict: dict[str, torch.Tensor] + ): + """ + Custom processing functions of each model that transforms `state_dict` loaded from + checkpoints to the state that can be used in `load_state_dict`. + Use `merge_mathced_keys` to update parameters with matched names and shapes by + default. + + Args + model_dict: + The state dict of the current model, which is going to load pretrained parameters + state_dict: + A dictionary of parameters from a pre-trained model. + + Returns: + dict[str, torch.Tensor]: + The updated state dict, where parameters with matched keys and shape are + updated with values in `state_dict`. 
+ """ + state_dict = merge_matched_keys(model_dict, state_dict) + return state_dict + + def load_pretrained(self, ckpt_path: str | Path): + load_pretrained_model( + self, ckpt_path, state_dict_process_fn=self.process_state_dict + ) + + +class CountParamsBase(nn.Module): + def count_params(self): + num_params = 0 + trainable_params = 0 + for param in self.parameters(): + num_params += param.numel() + if param.requires_grad: + trainable_params += param.numel() + return num_params, trainable_params + + +class SaveTrainableParamsBase(nn.Module): + @property + def param_names_to_save(self): + names = [] + for name, param in self.named_parameters(): + if param.requires_grad: + names.append(name) + for name, _ in self.named_buffers(): + names.append(name) + return names + + def load_state_dict(self, state_dict, strict=True, assign=True): + print("State dict keys:", list(state_dict.keys())) + #for key in self.param_names_to_save: + # if key not in state_dict: + # raise Exception( + # f"{key} not found in either pre-trained models (e.g. BERT)" + # " or resumed checkpoints (e.g. 
epoch_40/model.pt)" + # ) + # 兼容 PyTorch/transformers 的 assign 参数 + return super().load_state_dict(state_dict, strict=strict, assign=assign) \ No newline at end of file diff --git a/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e0552905613763ed5a88794610d9dbe05a10131 Binary files /dev/null and b/models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33efd478b08e3f719abf053f4f520eb7302f5371 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d40ca4898f7470a736c2a171a1e38a7d2ab9d9ca Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..79adb09e0091180f1bc35847071909812d972d1d Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e012f9e93ee6c113928c4dc360d46da1972f18fa Binary files /dev/null and 
b/models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf6a282b5699a8cd3326d6fdc3b92909399af824 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2eb75653d1b41deef2d121517f4ef456f9c0c1ac Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d333ccde4745c521f0e667eb16c19dc084e9f03 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e7b41dfaf5f32cd73c6cb0b99541b9f1ddc042a Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..19b96e898a962712ea8ae84b21ce7a24613d1d4c Binary files /dev/null and 
b/models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..179970ea364694e59ca267f3858b339c3720c21d Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f97e423fc2c523ff9fc6272dc96757eee3dd34dd Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37952ea021a63ed12e1ed07d1073cb2f0abdee00 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab920c0c1e5ced61a6b1856cf5305c7a2b2746e Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02b660998b3b2bc248603d52ffbdf8ba1b0a83ad Binary files /dev/null and 
b/models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e3d2b38176b73a4dc3683a7b9b990fb05b44157 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd5b37a17053da1f29a3c9d93352ba3f76dcff51 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..731dd53850c2d82afc8aa66a3d0e2e6711dd41c5 Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cfd15e674bfb421c1f7516ff6e9b594ccfc11ce Binary files /dev/null and b/models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/content_test.cpython-310.pyc b/models/content_encoder/__pycache__/content_test.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05a6d2efc1a9bb64543023ba3f08c3e389bc6e6b Binary files /dev/null and b/models/content_encoder/__pycache__/content_test.cpython-310.pyc 
differ diff --git a/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95b289a672b9819bccf7cb334e3f93ffe02b53b5 Binary files /dev/null and b/models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc differ diff --git a/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc b/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08cfbf6e6a50215a83462c23f932331b30eee53a Binary files /dev/null and b/models/content_encoder/__pycache__/text_encoder.cpython-310.pyc differ diff --git a/models/content_encoder/caption_encoder.py b/models/content_encoder/caption_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..967d59bd4fe1453959d7d2493718f19eb888b18d --- /dev/null +++ b/models/content_encoder/caption_encoder.py @@ -0,0 +1,116 @@ +from typing import Any +import torch +import torch.nn as nn +import random +from utils.audiotime_event_merge import replace_event_synonyms + +def decode_data(line_onset_str, latent_length): + """ + Extracts a timestamp matrix (event onset indices) from a formatted onset string. + + Args: + line_onset_str (str): String containing event names and onset intervals, + formatted like "event1__start1-end1_start2-end2--event2__start1-end1". + latent_length (int): Length of the output matrix. + + Returns: + line_onset_index (torch.Tensor): Matrix of shape [4, latent_length], + line_event (list): List of event names extracted from the onset string. + + Notes: + - 24000 is the audio sample rate. + - 480 is the downsample ratio to align with VAE. + - Each onset interval "start-end" (in seconds) is converted to embedding indices via (time * 24000 / 480). 
+ """ + line_onset_index = torch.zeros((4, latent_length)) # max for 4 events + line_event = [] + event_idx = 0 + for event_onset in line_onset_str.split('--'): + #print(event_onset) + (event, instance) = event_onset.split('__') + #print(instance) + line_event.append(event) + for start_end in instance.split('_'): + (start, end) = start_end.split('-') + start, end = int(float(start)*24000/480), int(float(end)*24000/480) + if end > (latent_length - 1): break + line_onset_index[event_idx, start: end] = 1 + event_idx = event_idx + 1 + return line_onset_index, line_event + + +class ContentEncoder(nn.Module): + """ + ContentEncoder encodes TCC and TDC information. + """ + def __init__( + self, + text_encoder: nn.Module= None, + ): + super().__init__() + self.text_encoder = text_encoder + self.pool = nn.AdaptiveAvgPool1d(1) + + def encode_content( + self, batch_content: list[Any], device: str | torch.device + ): + batch_output = [] + batch_mask = [] + batch_onset = [] + length_list = [] + print(batch_content) + for content in batch_content: + + caption = content["caption"] + onset = content["onset"] + length = int(float(content["length"]) *24000/480) + # Replacement for AudioTime + print(onset) + replace_label = content.get("replace_label", "False") + if replace_label == "True": + caption, onset = replace_event_synonyms(caption, onset) + + # Handle random onset case for read data without timestamp + if content["onset"] == "random": + length_list.append(length) + """ + fixed embedding. Actually it's a sick sentence, a error during training, kept to match the checkpoint. + You can change it to sentence that difference to captions in datasets. + The use of fixed text to obtain encoding is for numerical stability. + We attempted to use learnable unified encoding during training, but the results were not satisfactory. 
+ """ + event = "There is no event here" + event_embed = self.text_encoder([event.replace("_", " ")])["output"] + event_embed = self.pool(event_embed.permute(0, 2, 1)) # (B, 1024, 1) + event_embed = event_embed.flatten().unsqueeze(0) + new_onset = event_embed.repeat(length, 1).T + else: + onset_matrix, events = decode_data(onset, length) + length_list.append(length) + new_onset = torch.zeros((1024, length), device=device) # 1024 for T5 + # TDC + for (idx, event) in enumerate(events): + with torch.no_grad(): + event_embed = self.text_encoder([event.replace("_", " ")])["output"] + event_embed = self.pool(event_embed.permute(0, 2, 1)) # (B, 1024, 1) + event_embed = event_embed.flatten().unsqueeze(0) + mask = (onset_matrix[idx, :] == 0) + cols = mask.nonzero(as_tuple=True)[0] + new_onset[:, cols] += event_embed.T.float() + # TCC + output_dict = self.text_encoder([caption]) + batch_output.append(output_dict["output"][0]) + batch_mask.append(output_dict["mask"][0]) + batch_onset.append(new_onset) + + # Pad all sequences in the batch to the same length for batching + batch_output = nn.utils.rnn.pad_sequence( + batch_output, batch_first=True, padding_value=0 + ) + batch_mask = nn.utils.rnn.pad_sequence( + batch_mask, batch_first=True, padding_value=False + ) + batch_onset = nn.utils.rnn.pad_sequence( + batch_onset, batch_first=True, padding_value=0 + ) + return batch_output, batch_mask, batch_onset, length_list diff --git a/models/content_encoder/text_encoder.py b/models/content_encoder/text_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c830be27b0aad930945d8578b047bcf6618d07ff --- /dev/null +++ b/models/content_encoder/text_encoder.py @@ -0,0 +1,76 @@ +import torch +import torch.nn as nn +from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel +from transformers.modeling_outputs import BaseModelOutput + +try: + import torch_npu + from torch_npu.contrib import transfer_to_npu + DEVICE_TYPE = "npu" +except 
class TransformersTextEncoderBase(nn.Module):
    """
    Base class for text encoding using HuggingFace Transformers models.

    Sub-classes must provide ``self.tokenizer`` and ``self.model``.
    """
    def __init__(self, model_name: str):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(
        self,
        text: list[str],
    ):
        """
        Tokenize and encode a batch of strings.

        Returns:
            dict: ``{"output": last hidden state, "mask": boolean tensor that
            is True where the tokenizer's attention mask equals 1}``.
        """
        device = self.model.device
        batch = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        input_ids = batch.input_ids.to(device)
        attention_mask = batch.attention_mask.to(device)
        output = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # attention_mask already lives on `device`; the comparison keeps it there.
        mask = attention_mask == 1

        return {"output": output, "mask": mask}


class T5TextEncoder(TransformersTextEncoderBase):
    """
    Text encoder using T5 encoder model.

    All encoder parameters are frozen and the module is put in eval mode at
    construction time.
    """
    def __init__(self, model_name: str = "/mnt/petrelfs/zhengzihao/cache/google-flan-t5-large"):
        # Skip the base constructor on purpose: it would load Auto* classes.
        nn.Module.__init__(self)
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5EncoderModel.from_pretrained(model_name)
        for param in self.model.parameters():
            param.requires_grad = False
        self.eval()

    def forward(
        self,
        text: list[str],
    ):
        """Encode `text` without gradients and with autocast switched off."""
        # Disable autocast for the device the encoder actually runs on, rather
        # than for a device type guessed at import time, so the guard also
        # takes effect when the model sits on a different backend (e.g. CPU).
        device_type = self.model.device.type
        with torch.no_grad(), torch.amp.autocast(
            device_type=device_type, enabled=False
        ):
            return super().forward(text)


if __name__ == '__main__':
    text_encoder = T5TextEncoder()
    text = ["dog barking and cat moving"]
    text_encoder.eval()
    with torch.no_grad():
        output = text_encoder(text)
    print(output["output"].shape)
class DiffusionMixin:
    """
    Noise-scheduler helpers (timestep sampling, loss target, SNR-weighted
    loss) meant to be mixed into a diffusion model class.
    """
    def __init__(
        self,
        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
        snr_gamma: float | None = None,
        classifier_free_guidance: bool = True,
        cfg_drop_ratio: float = 0.2,
    ) -> None:
        self.noise_scheduler_name = noise_scheduler_name
        self.snr_gamma = snr_gamma
        self.classifier_free_guidance = classifier_free_guidance
        self.cfg_drop_ratio = cfg_drop_ratio
        self.noise_scheduler = noise_schedulers.DDIMScheduler.from_pretrained(
            self.noise_scheduler_name, subfolder="scheduler"
        )

    def compute_snr(self, timesteps) -> torch.Tensor:
        """
        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
        """
        alphas_cumprod = self.noise_scheduler.alphas_cumprod
        sqrt_alphas_cumprod = alphas_cumprod**0.5
        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5

        # Expand the tensors.
        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(
            device=timesteps.device
        )[timesteps].float()
        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)

        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
            device=timesteps.device
        )[timesteps].float()
        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[
                ..., None]
        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)

        # Compute SNR.
        snr = (alpha / sigma)**2
        return snr

    def get_timesteps(
        self,
        batch_size: int,
        device: torch.device,
        training: bool = True
    ) -> torch.Tensor:
        """Sample random timesteps for training, or a fixed mid-point otherwise."""
        num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
        if training:
            timesteps = torch.randint(
                0, num_train_timesteps, (batch_size, ), device=device
            )
        else:
            # validation on half of the total timesteps
            timesteps = (num_train_timesteps // 2) * torch.ones(
                (batch_size, ), dtype=torch.int64, device=device
            )
        return timesteps.long()

    def get_target(
        self, latent: torch.Tensor, noise: torch.Tensor,
        timesteps: torch.Tensor
    ) -> torch.Tensor:
        """
        Get the target for loss depending on the prediction type

        Raises:
            ValueError: If the scheduler's prediction type is neither
                "epsilon" nor "v_prediction".
        """
        prediction_type = self.noise_scheduler.config.prediction_type
        if prediction_type == "epsilon":
            return noise
        if prediction_type == "v_prediction":
            return self.noise_scheduler.get_velocity(latent, noise, timesteps)
        raise ValueError(f"Unknown prediction type {prediction_type}")

    def loss_with_snr(
        self, pred: torch.Tensor, target: torch.Tensor,
        timesteps: torch.Tensor, mask: torch.Tensor
    ) -> torch.Tensor:
        """Masked MSE loss, optionally re-weighted by min(SNR, snr_gamma) / SNR."""
        loss = F.mse_loss(pred.float(), target.float(), reduction="none")
        if self.snr_gamma is None:
            return loss_with_mask(loss, mask)
        # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
        # Adapted from huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py
        snr = self.compute_snr(timesteps)
        mse_loss_weights = (
            torch.stack([snr, self.snr_gamma * torch.ones_like(timesteps)],
                        dim=1).min(dim=1)[0] / snr
        )
        loss = loss_with_mask(loss, mask, reduce=False) * mse_loss_weights
        return loss.mean()


class AudioDiffusion(
    LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
    DiffusionMixin
):
    """
    Args:
        autoencoder (AutoEncoderBase): Pretrained autoencoder module VAE(frozen).
        content_encoder (ContentEncoder): Encodes TCC and TDC information.
        backbone (nn.Module): Main denoising network.
        frame_resolution (float): Resolution for audio frames.
        noise_scheduler_name (str): Noise scheduler identifier.
        snr_gamma (float, optional): SNR gamma for noise scheduler.
        classifier_free_guidance (bool): Enable classifier-free guidance.
        cfg_drop_ratio (float): Ratio for randomly dropping context for classifier-free guidance.
    """
    def __init__(
        self,
        autoencoder: AutoEncoderBase,
        content_encoder: ContentEncoder,
        backbone: nn.Module,
        frame_resolution: float,
        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
        snr_gamma: float = None,
        classifier_free_guidance: bool = True,
        cfg_drop_ratio: float = 0.2,
    ):
        nn.Module.__init__(self)
        DiffusionMixin.__init__(
            self, noise_scheduler_name, snr_gamma, classifier_free_guidance,
            cfg_drop_ratio
        )

        self.autoencoder = autoencoder
        # Freeze autoencoder parameters
        for param in self.autoencoder.parameters():
            param.requires_grad = False

        self.content_encoder = content_encoder
        self.backbone = backbone
        self.frame_resolution = frame_resolution
        # Empty parameter used only to discover the module's current device.
        self.dummy_param = nn.Parameter(torch.empty(0))

    def forward(
        self, content: list[Any], condition: list[Any], task: list[str],
        waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
    ):
        """
        Training forward pass.

        Args:
            content (list[Any]): List of content dicts for each sample.
            condition (list[Any]): Conditioning information (unused here).
            task (list[str]): List of task types (unused here).
            waveform (Tensor): Batch of waveform tensors.
            waveform_lengths (Tensor): Lengths for each waveform sample.

        Returns:
            dict: Dictionary containing the diffusion loss.
        """
        device = self.dummy_param.device
        num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
        self.noise_scheduler.set_timesteps(num_train_timesteps, device=device)

        self.autoencoder.eval()
        with torch.no_grad():
            latent, latent_mask = self.autoencoder.encode(
                waveform.unsqueeze(1), waveform_lengths
            )
        # content(non_time_aligned_content) for TCC and time_aligned_content for TDC
        content, content_mask, onset, _ = self.content_encoder.encode_content(
            content, device=device
        )

        # prepare latent and diffusion-related noise
        time_aligned_content = onset.permute(0, 2, 1)
        if self.training and self.classifier_free_guidance:
            # Randomly drop the conditioning of some samples so the model also
            # learns the unconditional distribution.
            mask_indices = [
                k for k in range(len(waveform))
                if random.random() < self.cfg_drop_ratio
            ]
            if len(mask_indices) > 0:
                content[mask_indices] = 0
                time_aligned_content[mask_indices] = 0

        batch_size = latent.shape[0]
        timesteps = self.get_timesteps(batch_size, device, self.training)
        noise = torch.randn_like(latent)
        noisy_latent = self.noise_scheduler.add_noise(latent, noise, timesteps)
        target = self.get_target(latent, noise, timesteps)

        # Denoising prediction
        pred: torch.Tensor = self.backbone(
            x=noisy_latent,
            timesteps=timesteps,
            time_aligned_context=time_aligned_content,
            context=content,
            x_mask=latent_mask,
            context_mask=content_mask
        )
        pred = pred.transpose(1, self.autoencoder.time_dim)
        target = target.transpose(1, self.autoencoder.time_dim)
        diff_loss = self.loss_with_snr(pred, target, timesteps, latent_mask)
        return {
            "diff_loss": diff_loss,
        }

    @torch.no_grad()
    def inference(
        self,
        content: list[Any],
        num_steps: int = 20,
        guidance_scale: float = 3.0,
        guidance_rescale: float = 0.0,
        disable_progress: bool = True,
        num_samples_per_content: int = 1,
        **kwargs
    ):
        """
        Inference/generation method for audio diffusion.

        Args:
            content (list[Any]): List of content dicts.
            num_steps (int): Number of denoising steps.
            guidance_scale (float): Classifier-free guidance scale; guidance
                is used only when this is greater than 1.0.
            guidance_rescale (float): Rescale factor for guidance.
            disable_progress (bool): Disable progress bar.
            num_samples_per_content (int): How many samples to generate per content.

        Returns:
            waveform (Tensor): Generated waveform.
        """
        device = self.dummy_param.device
        classifier_free_guidance = guidance_scale > 1.0
        batch_size = len(content) * num_samples_per_content
        if classifier_free_guidance:
            content, content_mask, onset, length_list = self.encode_content_classifier_free(
                content, num_samples_per_content=num_samples_per_content
            )
        else:
            content, content_mask, onset, length_list = self.content_encoder.encode_content(
                content, device=device
            )
            content = content.repeat_interleave(num_samples_per_content, 0)
            content_mask = content_mask.repeat_interleave(
                num_samples_per_content, 0
            )
        # The time-aligned condition and the lengths need one copy per
        # generated sample as well, otherwise they do not match `batch_size`.
        onset = onset.repeat_interleave(num_samples_per_content, 0)
        length_list = [
            length for length in length_list
            for _ in range(num_samples_per_content)
        ]

        self.noise_scheduler.set_timesteps(num_steps, device=device)
        timesteps = self.noise_scheduler.timesteps

        # prepare input latent and context for the backbone
        shape = (batch_size, 128, onset.shape[2])  # 128 for StableVAE channels
        time_aligned_content = onset.permute(0, 2, 1)
        latent = randn_tensor(
            shape, generator=None, device=device, dtype=content.dtype
        )

        # scale the initial noise by the standard deviation required by the scheduler
        latent = latent * self.noise_scheduler.init_noise_sigma
        latent_mask = torch.full(
            (batch_size, onset.shape[2]), False, device=device
        )
        for i, length in enumerate(length_list):
            # Set latent mask True for valid time steps for each sample
            latent_mask[i, :length] = True
        num_warmup_steps = len(timesteps) - num_steps * self.noise_scheduler.order
        progress_bar = tqdm(range(num_steps), disable=disable_progress)

        if classifier_free_guidance:
            uncond_time_aligned_content = torch.zeros_like(
                time_aligned_content
            )
            time_aligned_content = torch.cat(
                [uncond_time_aligned_content, time_aligned_content]
            )
            latent_mask = torch.cat(
                [latent_mask, latent_mask.detach().clone()]
            )

        # iteratively denoising
        for i, timestep in enumerate(timesteps):
            latent_input = torch.cat(
                [latent, latent]
            ) if classifier_free_guidance else latent
            latent_input = self.noise_scheduler.scale_model_input(
                latent_input, timestep
            )

            noise_pred = self.backbone(
                x=latent_input,
                x_mask=latent_mask,
                timesteps=timestep,
                time_aligned_context=time_aligned_content,
                context=content,
                context_mask=content_mask,
            )

            if classifier_free_guidance:
                noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (
                    noise_pred_content - noise_pred_uncond
                )
                if guidance_rescale != 0.0:
                    noise_pred = self.rescale_cfg(
                        noise_pred_content, noise_pred, guidance_rescale
                    )
            # compute the previous noisy sample x_t -> x_t-1
            latent = self.noise_scheduler.step(
                noise_pred, timestep, latent
            ).prev_sample

            if i == len(timesteps) - 1 or (
                (i + 1) > num_warmup_steps and
                (i + 1) % self.noise_scheduler.order == 0
            ):
                progress_bar.update(1)
        progress_bar.close()

        waveform = self.autoencoder.decode(latent)
        return waveform

    def encode_content_classifier_free(
        self,
        content: list[Any],
        task: list[str] | None = None,
        num_samples_per_content: int = 1
    ):
        """
        Encode content and prepend an all-zero unconditional copy for
        classifier-free guidance.

        Args:
            content (list[Any]): List of content dicts.
            task (list[str], optional): Unused; kept for call compatibility.
            num_samples_per_content (int): Copies of each caption embedding.

        Returns:
            tuple: ``(content, content_mask, onset, length_list)`` where
            ``content``/``content_mask`` have batch size
            ``2 * len(content) * num_samples_per_content`` (unconditional
            half first) and ``onset``/``length_list`` are returned exactly as
            produced by the content encoder (not repeated).
        """
        device = self.dummy_param.device

        content, content_mask, onset, length_list = self.content_encoder.encode_content(
            content, device=device
        )
        content = content.repeat_interleave(num_samples_per_content, 0)
        content_mask = content_mask.repeat_interleave(
            num_samples_per_content, 0
        )

        # Unconditional embeddings: built from the already repeated tensors,
        # so they must not be repeated a second time.
        uncond_content = torch.zeros_like(content)
        uncond_content_mask = content_mask.detach().clone()

        # For classifier free guidance, we need to do two forward passes.
        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
        content = torch.cat([uncond_content, content])
        content_mask = torch.cat([uncond_content_mask, content_mask])

        return content, content_mask, onset, length_list

    def rescale_cfg(
        self, pred_cond: torch.Tensor, pred_cfg: torch.Tensor,
        guidance_rescale: float
    ):
        """
        Rescale `pred_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
        Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4

        Returns:
            Tensor: The rescaled guided prediction, same shape as `pred_cfg`.
        """
        std_cond = pred_cond.std(
            dim=list(range(1, pred_cond.ndim)), keepdim=True
        )
        std_cfg = pred_cfg.std(dim=list(range(1, pred_cfg.ndim)), keepdim=True)

        pred_rescaled = pred_cfg * (std_cond / std_cfg)
        pred_cfg = guidance_rescale * pred_rescaled + (
            1 - guidance_rescale
        ) * pred_cfg
        return pred_cfg
if hasattr(nn.functional, 'scaled_dot_product_attention'):
    ATTENTION_MODE = 'flash'
else:
    ATTENTION_MODE = 'math'
print(f'attention mode is {ATTENTION_MODE}')


def add_mask(sim, mask):
    """Fill positions of `sim` where boolean `mask` is False with a large negative value.

    `mask` may be 2-D (n, m), 3-D (b, n, m) or already broadcastable 4-D.
    """
    b, ndim = sim.shape[0], mask.ndim
    if ndim == 3:
        mask = mask[:, None]
    if ndim == 2:
        mask = mask[None, None].expand(b, 1, -1, -1)
    max_neg_value = -torch.finfo(sim.dtype).max
    sim = sim.masked_fill(~mask, max_neg_value)
    return sim


def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
    """Combine per-query and per-key masks into a (b, 1, i, j) attention mask.

    A missing mask is treated as all-True.
    """
    b, i, j = q_shape[0], q_shape[-2], k_shape[-2]
    if q_mask is None:
        q_mask = torch.ones((b, i), device=device, dtype=torch.bool)
    if k_mask is None:
        k_mask = torch.ones((b, j), device=device, dtype=torch.bool)
    return q_mask[:, None, :, None] * k_mask[:, None, None, :]


def _split_heads(t, num_heads):
    """(B, L, H*D) -> (B, H, L, D)."""
    b, length, _ = t.shape
    return t.reshape(b, length, num_heads, -1).transpose(1, 2)


def _merge_heads(t):
    """(B, H, L, D) -> (B, L, H*D)."""
    b, heads, length, head_dim = t.shape
    return t.transpose(1, 2).reshape(b, length, heads * head_dim)


def _build_qk_norm(qk_norm, head_dim):
    """Return the (query, key) normalisation layers selected by `qk_norm`."""
    if qk_norm is None:
        return nn.Identity(), nn.Identity()
    if qk_norm == 'layernorm':
        return nn.LayerNorm(head_dim), nn.LayerNorm(head_dim)
    if qk_norm == 'rmsnorm':
        return RMSNorm(head_dim), RMSNorm(head_dim)
    raise NotImplementedError


def _apply_rope(module, q, k, extras):
    """Apply rotary embeddings to q/k according to `module.rope_mode`.

    The first `extras` positions are treated as the context/extra tokens.
    """
    mode = module.rope_mode
    if mode == 'none':
        return q, k
    if mode == 'shared':
        q, k = module.rotary(q=q, k=k)
        return q, k
    if mode in ('x_only', 'dual'):
        rotary_x = module.rotary if mode == 'x_only' else module.rotary_x
        q_x, k_x = rotary_x(q=q[:, :, extras:, :], k=k[:, :, extras:, :])
        q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
        if mode == 'dual':
            q_c, k_c = module.rotary_c(q=q_c, k=k_c)
        return torch.cat((q_c, q_x), dim=2), torch.cat((k_c, k_x), dim=2)
    raise NotImplementedError


def _attend(module, q, k, v, mask_binary, mode=None):
    """Scaled dot-product attention on (B, H, L, D) tensors -> (B, L, H*D)."""
    mode = ATTENTION_MODE if mode is None else mode
    if mode == 'flash':
        # The functional API applies dropout whenever dropout_p > 0, so it has
        # to be switched off explicitly outside training.
        # NOTE: this path uses the default 1/sqrt(D) scaling, not module.scale.
        dropout_p = module.attn_drop_p if module.training else 0.0
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=dropout_p, attn_mask=mask_binary
        )
    elif mode == 'math':
        attn = (q @ k.transpose(-2, -1)) * module.scale
        if mask_binary is not None:
            attn = add_mask(attn, mask_binary)
        attn = module.attn_drop(attn.softmax(dim=-1))
        # Stay in (B, H, L, D) here; the head merge below does the transpose.
        x = attn @ v
    else:
        raise NotImplementedError
    return _merge_heads(x)


class Attention(nn.Module):
    """Multi-head self- or cross-attention (cross when `context_dim` is given)."""
    def __init__(
        self,
        dim,
        context_dim=None,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        qk_norm=None,
        attn_drop=0.,
        proj_drop=0.,
        rope_mode='none'
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.cross_attn = context_dim is not None
        context_dim = dim if context_dim is None else context_dim

        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)

        self.norm_q, self.norm_k = _build_qk_norm(qk_norm, head_dim)

        self.attn_drop_p = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        if self.cross_attn:
            assert rope_mode == 'none'
        self.rope_mode = rope_mode
        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
            self.rotary = RotaryEmbedding(dim=head_dim)
        elif self.rope_mode == 'dual':
            self.rotary_x = RotaryEmbedding(dim=head_dim)
            self.rotary_c = RotaryEmbedding(dim=head_dim)

    def _rotary(self, q, k, extras):
        return _apply_rope(self, q, k, extras)

    def _attn(self, q, k, v, mask_binary, mode=None):
        return _attend(self, q, k, v, mask_binary, mode=mode)

    def forward(self, x, context=None, context_mask=None, extras=0):
        """Attend from `x` to `context` (or to `x` itself when context is None).

        `context_mask` is a boolean (B, Lc) mask over the key positions.
        """
        if context is None:
            context = x

        q = self.to_q(x)
        k = self.to_k(context)
        v = self.to_v(context)

        if context_mask is not None:
            mask_binary = create_mask(
                x.shape, context.shape, x.device, None, context_mask
            )
        else:
            mask_binary = None

        q = _split_heads(q, self.num_heads)
        k = _split_heads(k, self.num_heads)
        v = _split_heads(v, self.num_heads)

        q = self.norm_q(q)
        k = self.norm_k(k)

        q, k = self._rotary(q, k, extras)

        x = self._attn(q, k, v, mask_binary)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class JointAttention(nn.Module):
    """Attention over the concatenation [context, x] with separate projections per stream."""
    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        qk_norm=None,
        attn_drop=0.,
        proj_drop=0.,
        rope_mode='none'
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(
            dim, qkv_bias
        )
        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(
            dim, qkv_bias
        )

        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)

        self.attn_drop_p = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)

        self.proj_x = nn.Linear(dim, dim)
        self.proj_drop_x = nn.Dropout(proj_drop)

        self.proj_c = nn.Linear(dim, dim)
        self.proj_drop_c = nn.Dropout(proj_drop)

        self.rope_mode = rope_mode
        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
            self.rotary = RotaryEmbedding(dim=head_dim)
        elif self.rope_mode == 'dual':
            self.rotary_x = RotaryEmbedding(dim=head_dim)
            self.rotary_c = RotaryEmbedding(dim=head_dim)

    def _make_qkv_layers(self, dim, qkv_bias):
        return (
            nn.Linear(dim, dim, bias=qkv_bias),
            nn.Linear(dim, dim, bias=qkv_bias),
            nn.Linear(dim, dim, bias=qkv_bias),
        )

    def _make_norm_layers(self, qk_norm, head_dim):
        return _build_qk_norm(qk_norm, head_dim)

    def _rotary(self, q, k, extras):
        return _apply_rope(self, q, k, extras)

    def _attn(self, q, k, v, mask_binary, mode=None):
        return _attend(self, q, k, v, mask_binary, mode=mode)

    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
        """Concatenate the context and x masks (context first); missing masks are all-True."""
        B = x.shape[0]
        if x_mask is None:
            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
        if context_mask is None:
            context_mask = torch.ones(
                B, context.shape[-2], device=context.device
            ).bool()
        return torch.cat([context_mask, x_mask], dim=1)

    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
        """Jointly attend over [context, x]; returns the updated `(x, context)`."""
        B, Lx, C = x.shape
        _, Lc, _ = context.shape
        if x_mask is not None or context_mask is not None:
            mask = self._cat_mask(
                x, context, x_mask=x_mask, context_mask=context_mask
            )
            shape = [B, Lx + Lc, C]
            mask_binary = create_mask(
                q_shape=shape,
                k_shape=shape,
                device=x.device,
                q_mask=None,
                k_mask=mask
            )
        else:
            mask_binary = None

        qx, kx, vx = (
            _split_heads(t, self.num_heads)
            for t in (self.to_qx(x), self.to_kx(x), self.to_vx(x))
        )
        qc, kc, vc = (
            _split_heads(t, self.num_heads)
            for t in
            (self.to_qc(context), self.to_kc(context), self.to_vc(context))
        )

        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
        qc, kc = self.norm_qc(qc), self.norm_kc(kc)

        # Context tokens come first, matching the mask layout of _cat_mask.
        q = torch.cat([qc, qx], dim=2)
        k = torch.cat([kc, kx], dim=2)
        v = torch.cat([vc, vx], dim=2)

        q, k = self._rotary(q, k, extras)

        x = self._attn(q, k, v, mask_binary)

        context, x = x[:, :Lc, :], x[:, Lc:, :]

        x = self.proj_x(x)
        x = self.proj_drop_x(x)

        context = self.proj_c(context)
        context = self.proj_drop_c(context)

        return x, context
+ """ + def __init__( + self, + dim, + time_aligned_context_dim, + dilation, + context_dim=None, + num_heads=8, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + qk_norm=None, + act_layer='gelu', + norm_layer=nn.LayerNorm, + time_fusion='none', + ada_sola_rank=None, + ada_sola_alpha=None, + skip=False, + skip_norm=False, + rope_mode='none', + context_norm=False, + use_checkpoint=False + ): + super().__init__( + dim=dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=skip, + skip_norm=skip_norm, + rope_mode=rope_mode, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) + # time-aligned context projection + self.ta_context_projection = nn.Linear( + time_aligned_context_dim, 2 * dim + ) + self.dilated_conv = nn.Conv1d( + dim, 2 * dim, kernel_size=3, padding=dilation, dilation=dilation + ) + + def forward( + self, + x, + time_aligned_context, + time_token=None, + time_ada=None, + skip=None, + context=None, + x_mask=None, + context_mask=None, + extras=None + ): + if self.use_checkpoint: + return checkpoint( + self._forward, + x, + time_aligned_context, + time_token, + time_ada, + skip, + context, + x_mask, + context_mask, + extras, + use_reentrant=False + ) + else: + return self._forward( + x, + time_aligned_context, + time_token, + time_ada, + skip, + context, + x_mask, + context_mask, + extras, + ) + + def _forward( + self, + x, + time_aligned_context, + time_token=None, + time_ada=None, + skip=None, + context=None, + x_mask=None, + context_mask=None, + extras=None + ): + B, T, C = x.shape + if self.skip_linear is not None: + assert skip is not None + cat = torch.cat([x, skip], dim=-1) + cat = self.skip_norm(cat) + x = self.skip_linear(cat) + + if self.use_adanorm: + time_ada = self.adaln(time_token, time_ada) + (shift_msa, 
scale_msa, gate_msa, shift_mlp, scale_mlp, + gate_mlp) = time_ada.chunk(6, dim=1) + + # self attention + if self.use_adanorm: + x_norm = film_modulate( + self.norm1(x), shift=shift_msa, scale=scale_msa + ) + x = x + (1-gate_msa) * self.attn( + x_norm, context=None, context_mask=x_mask, extras=extras + ) + else: + # TODO diffusion timestep input is not fused here + x = x + self.attn( + self.norm1(x), + context=None, + context_mask=x_mask, + extras=extras + ) + + # time-aligned context + time_aligned_context = self.ta_context_projection(time_aligned_context) + x = self.dilated_conv(x.transpose(1, 2) + ).transpose(1, 2) + time_aligned_context + + gate, filter = torch.chunk(x, 2, dim=-1) + x = torch.sigmoid(gate) * torch.tanh(filter) + + # cross attention + if self.use_context: + assert context is not None + x = x + self.cross_attn( + x=self.norm2(x), + context=self.norm_context(context), + context_mask=context_mask, + extras=extras + ) + + # mlp + if self.use_adanorm: + x_norm = film_modulate( + self.norm3(x), shift=shift_mlp, scale=scale_mlp + ) + x = x + (1-gate_mlp) * self.mlp(x_norm) + else: + x = x + self.mlp(self.norm3(x)) + + return x + + +class AudioUDiT(UDiT): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + input_type='2d', + out_chans=None, + embed_dim=768, + depth=12, + dilation_cycle_length=4, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + qk_norm=None, + act_layer='gelu', + norm_layer='layernorm', + context_norm=False, + use_checkpoint=False, + time_fusion='token', + ada_sola_rank=None, + ada_sola_alpha=None, + cls_dim=None, + time_aligned_context_dim=768, + context_dim=768, + context_fusion='concat', + context_max_length=128, + context_pe_method='sinu', + pe_method='abs', + rope_mode='none', + use_conv=True, + skip=True, + skip_norm=True + ): + nn.Module.__init__(self) + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + # input + self.in_chans = in_chans + 
self.input_type = input_type + if self.input_type == '2d': + num_patches = (img_size[0] // + patch_size) * (img_size[1] // patch_size) + elif self.input_type == '1d': + num_patches = img_size // patch_size + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + input_type=input_type + ) + out_chans = in_chans if out_chans is None else out_chans + self.out_chans = out_chans + + # position embedding + self.rope = rope_mode + self.x_pe = PE_wrapper( + dim=embed_dim, method=pe_method, length=num_patches + ) + + # time embed + self.time_embed = TimestepEmbedder(embed_dim) + self.time_fusion = time_fusion + self.use_adanorm = False + + # cls embed + if cls_dim is not None: + self.cls_embed = nn.Sequential( + nn.Linear(cls_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + else: + self.cls_embed = None + + # time fusion + if time_fusion == 'token': + # put token at the beginning of sequence + self.extras = 2 if self.cls_embed else 1 + self.time_pe = PE_wrapper( + dim=embed_dim, method='abs', length=self.extras + ) + elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']: + self.use_adanorm = True + # aviod repetitive silu for each adaln block + self.time_act = nn.SiLU() + self.extras = 0 + self.time_ada_final = nn.Linear( + embed_dim, 2 * embed_dim, bias=True + ) + if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']: + # shared adaln + self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True) + else: + self.time_ada = None + else: + raise NotImplementedError + + # context + # use a simple projection + self.use_context = False + self.context_cross = False + self.context_max_length = context_max_length + self.context_fusion = 'none' + if context_dim is not None: + self.use_context = True + self.context_embed = nn.Sequential( + nn.Linear(context_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + self.context_fusion 
= context_fusion + if context_fusion == 'concat' or context_fusion == 'joint': + self.extras += context_max_length + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + # no cross attention layers + context_dim = None + elif context_fusion == 'cross': + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + self.context_cross = True + context_dim = embed_dim + else: + raise NotImplementedError + + self.use_skip = skip + + # norm layers + if norm_layer == 'layernorm': + norm_layer = nn.LayerNorm + elif norm_layer == 'rmsnorm': + norm_layer = RMSNorm + else: + raise NotImplementedError + + self.in_blocks = nn.ModuleList([ + AudioDiTBlock( + dim=embed_dim, + time_aligned_context_dim=time_aligned_context_dim, + dilation=2**(i % dilation_cycle_length), + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for i in range(depth // 2) + ]) + + self.mid_block = AudioDiTBlock( + dim=embed_dim, + time_aligned_context_dim=time_aligned_context_dim, + dilation=1, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) + + self.out_blocks = nn.ModuleList([ + AudioDiTBlock( + dim=embed_dim, + time_aligned_context_dim=time_aligned_context_dim, + dilation=2**(i % dilation_cycle_length), + 
context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=skip, + skip_norm=skip_norm, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for i in range(depth // 2) + ]) + + # FinalLayer block + self.use_conv = use_conv + self.final_block = FinalBlock( + embed_dim=embed_dim, + patch_size=patch_size, + img_size=img_size, + in_chans=out_chans, + input_type=input_type, + norm_layer=norm_layer, + use_conv=use_conv, + use_adanorm=self.use_adanorm + ) + self.initialize_weights() + + def forward( + self, + x, + timesteps, + time_aligned_context, + context, + x_mask=None, + context_mask=None, + cls_token=None, + controlnet_skips=None, + ): + # make it compatible with int time step during inference + if timesteps.dim() == 0: + timesteps = timesteps.expand(x.shape[0] + ).to(x.device, dtype=torch.long) + + x = self.patch_embed(x) + x = self.x_pe(x) + + B, L, D = x.shape + + if self.use_context: + context_token = self.context_embed(context) + context_token = self.context_pe(context_token) + if self.context_fusion == 'concat' or self.context_fusion == 'joint': + x, x_mask = self._concat_x_context( + x=x, + context=context_token, + x_mask=x_mask, + context_mask=context_mask + ) + context_token, context_mask = None, None + else: + context_token, context_mask = None, None + + time_token = self.time_embed(timesteps) + if self.cls_embed: + cls_token = self.cls_embed(cls_token) + time_ada = None + time_ada_final = None + if self.use_adanorm: + if self.cls_embed: + time_token = time_token + cls_token + time_token = self.time_act(time_token) + time_ada_final = self.time_ada_final(time_token) + if self.time_ada is not None: + time_ada = self.time_ada(time_token) + else: + time_token = time_token.unsqueeze(dim=1) + if self.cls_embed: + 
class AudioDiTBlock(DiTBlock):
    """
    DiT block that also fuses a time-aligned context into the latent.

    The fusion happens after self attention and before cross attention.
    ``ta_context_fusion`` selects how it is done:

    * ``"add"``: the (optionally normalised) time-aligned context is projected
      to ``dim`` and added to the latent.
    * ``"concat"``: latent and context are concatenated on the last axis,
      (optionally) normalised, and projected back to ``dim``.

    Any other value leaves the latent untouched; in that case no projection is
    created and ``ta_context_norm`` keeps the raw flag passed by the caller.
    All remaining arguments are forwarded unchanged to ``DiTBlock``.
    """

    def __init__(
        self,
        dim,
        ta_context_dim,
        ta_context_norm=False,
        context_dim=None,
        num_heads=8,
        mlp_ratio=4.,
        qkv_bias=False,
        qk_scale=None,
        qk_norm=None,
        act_layer='gelu',
        norm_layer=nn.LayerNorm,
        ta_context_fusion='add',
        time_fusion='none',
        ada_sola_rank=None,
        ada_sola_alpha=None,
        skip=False,
        skip_norm=False,
        rope_mode='none',
        context_norm=False,
        use_checkpoint=False
    ):
        super().__init__(
            dim=dim,
            context_dim=context_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            qk_norm=qk_norm,
            act_layer=act_layer,
            norm_layer=norm_layer,
            time_fusion=time_fusion,
            ada_sola_rank=ada_sola_rank,
            ada_sola_alpha=ada_sola_alpha,
            skip=skip,
            skip_norm=skip_norm,
            rope_mode=rope_mode,
            context_norm=context_norm,
            use_checkpoint=use_checkpoint
        )
        self.ta_context_fusion = ta_context_fusion

        # Width of the tensor that is normalised and then projected to `dim`.
        if ta_context_fusion == "add":
            fused_width = ta_context_dim
        elif ta_context_fusion == "concat":
            fused_width = ta_context_dim + dim
        else:
            # Unrecognised mode: no fusion layers, keep the flag as given.
            self.ta_context_norm = ta_context_norm
            return

        # Projection is registered before the norm so the sub-module order
        # (and therefore the state-dict layout) stays the same.
        self.ta_context_projection = nn.Linear(fused_width, dim)
        if ta_context_norm:
            self.ta_context_norm = norm_layer(fused_width)
        else:
            self.ta_context_norm = nn.Identity()

    def forward(
        self,
        x,
        time_aligned_context,
        time_token=None,
        time_ada=None,
        skip=None,
        context=None,
        x_mask=None,
        context_mask=None,
        extras=None
    ):
        """Run ``_forward``, through activation checkpointing when enabled."""
        call_args = (
            x,
            time_aligned_context,
            time_token,
            time_ada,
            skip,
            context,
            x_mask,
            context_mask,
            extras,
        )
        if not self.use_checkpoint:
            return self._forward(*call_args)
        return checkpoint(self._forward, *call_args, use_reentrant=False)

    def _forward(
        self,
        x,
        time_aligned_context,
        time_token=None,
        time_ada=None,
        skip=None,
        context=None,
        x_mask=None,
        context_mask=None,
        extras=None
    ):
        """
        Apply, in order: long-skip merge, self attention, time-aligned context
        fusion, cross attention (when the block has one) and the MLP.
        """
        # The values are unused; the three-way unpack only insists on a
        # three-axis input.
        _batch, _length, _channels = x.shape

        # long skip connection from the matching input block
        if self.skip_linear is not None:
            assert skip is not None
            merged = self.skip_norm(torch.cat([x, skip], dim=-1))
            x = self.skip_linear(merged)

        adanorm = self.use_adanorm
        if adanorm:
            modulation = self.adaln(time_token, time_ada)
            (
                shift_msa,
                scale_msa,
                gate_msa,
                shift_mlp,
                scale_mlp,
                gate_mlp,
            ) = modulation.chunk(6, dim=1)

        # self attention
        if adanorm:
            attn_in = film_modulate(
                self.norm1(x), shift=shift_msa, scale=scale_msa
            )
            x = x + (1 - gate_msa) * self.attn(
                attn_in, context=None, context_mask=x_mask, extras=extras
            )
        else:
            # TODO diffusion timestep input is not fused here
            attn_in = self.norm1(x)
            x = x + self.attn(
                attn_in, context=None, context_mask=x_mask, extras=extras
            )

        # time aligned context fusion
        fusion = self.ta_context_fusion
        if fusion == "add":
            x = x + self.ta_context_projection(
                self.ta_context_norm(time_aligned_context)
            )
        elif fusion == "concat":
            joined = torch.cat([x, time_aligned_context], dim=-1)
            x = self.ta_context_projection(self.ta_context_norm(joined))

        # cross attention
        if self.use_context:
            assert context is not None
            x = x + self.cross_attn(
                x=self.norm2(x),
                context=self.norm_context(context),
                context_mask=context_mask,
                extras=extras
            )

        # mlp
        if adanorm:
            mlp_in = film_modulate(
                self.norm3(x), shift=shift_mlp, scale=scale_mlp
            )
            return x + (1 - gate_mlp) * self.mlp(mlp_in)
        return x + self.mlp(self.norm3(x))
length=self.extras + ) + elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']: + self.use_adanorm = True + # aviod repetitive silu for each adaln block + self.time_act = nn.SiLU() + self.extras = 0 + self.time_ada_final = nn.Linear( + embed_dim, 2 * embed_dim, bias=True + ) + if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']: + # shared adaln + self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True) + else: + self.time_ada = None + else: + raise NotImplementedError + + # context + # use a simple projection + self.use_context = False + self.context_cross = False + self.context_max_length = context_max_length + self.context_fusion = 'none' + if context_dim is not None: + self.use_context = True + self.context_embed = nn.Sequential( + nn.Linear(context_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + self.context_fusion = context_fusion + if context_fusion == 'concat' or context_fusion == 'joint': + self.extras += context_max_length + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + # no cross attention layers + context_dim = None + elif context_fusion == 'cross': + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + self.context_cross = True + context_dim = embed_dim + else: + raise NotImplementedError + + self.use_skip = skip + + # norm layers + if norm_layer == 'layernorm': + norm_layer = nn.LayerNorm + elif norm_layer == 'rmsnorm': + norm_layer = RMSNorm + else: + raise NotImplementedError + + self.in_blocks = nn.ModuleList([ + AudioDiTBlock( + dim=embed_dim, + ta_context_dim=ta_context_dim, + ta_context_fusion=ta_context_fusion, + ta_context_norm=ta_context_norm, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, 
+ ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for i in range(depth // 2) + ]) + + self.mid_block = AudioDiTBlock( + dim=embed_dim, + ta_context_dim=ta_context_dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + ta_context_fusion=ta_context_fusion, + ta_context_norm=ta_context_norm, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) + + self.out_blocks = nn.ModuleList([ + AudioDiTBlock( + dim=embed_dim, + ta_context_dim=ta_context_dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + ta_context_fusion=ta_context_fusion, + ta_context_norm=ta_context_norm, + skip=skip, + skip_norm=skip_norm, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for i in range(depth // 2) + ]) + + # FinalLayer block + self.use_conv = use_conv + self.final_block = FinalBlock( + embed_dim=embed_dim, + patch_size=patch_size, + img_size=img_size, + in_chans=out_chans, + input_type=input_type, + norm_layer=norm_layer, + use_conv=use_conv, + use_adanorm=self.use_adanorm + ) + self.initialize_weights() + + def forward( + self, + x, + timesteps, + time_aligned_context, + context, + x_mask=None, + context_mask=None, + cls_token=None, + controlnet_skips=None, + ): + # make it compatible with int time step during inference + if timesteps.dim() == 0: + timesteps = timesteps.expand(x.shape[0] + ).to(x.device, 
dtype=torch.long) + + x = self.patch_embed(x) + x = self.x_pe(x) + + B, L, D = x.shape + + if self.use_context: + context_token = self.context_embed(context) + context_token = self.context_pe(context_token) + if self.context_fusion == 'concat' or self.context_fusion == 'joint': + x, x_mask = self._concat_x_context( + x=x, + context=context_token, + x_mask=x_mask, + context_mask=context_mask + ) + context_token, context_mask = None, None + else: + context_token, context_mask = None, None + + time_token = self.time_embed(timesteps) + if self.cls_embed: + cls_token = self.cls_embed(cls_token) + time_ada = None + time_ada_final = None + if self.use_adanorm: + if self.cls_embed: + time_token = time_token + cls_token + time_token = self.time_act(time_token) + time_ada_final = self.time_ada_final(time_token) + if self.time_ada is not None: + time_ada = self.time_ada(time_token) + else: + time_token = time_token.unsqueeze(dim=1) + if self.cls_embed: + cls_token = cls_token.unsqueeze(dim=1) + time_token = torch.cat([time_token, cls_token], dim=1) + time_token = self.time_pe(time_token) + x = torch.cat((time_token, x), dim=1) + if x_mask is not None: + x_mask = torch.cat([ + torch.ones(B, time_token.shape[1], + device=x_mask.device).bool(), x_mask + ], + dim=1) + time_token = None + + skips = [] + for blk in self.in_blocks: + x = blk( + x=x, + time_aligned_context=time_aligned_context, + time_token=time_token, + time_ada=time_ada, + skip=None, + context=context_token, + x_mask=x_mask, + context_mask=context_mask, + extras=self.extras + ) + + if self.use_skip: + skips.append(x) + + x = self.mid_block( + x=x, + time_aligned_context=time_aligned_context, + time_token=time_token, + time_ada=time_ada, + skip=None, + context=context_token, + x_mask=x_mask, + context_mask=context_mask, + extras=self.extras + ) + + for blk in self.out_blocks: + if self.use_skip: + skip = skips.pop() + if controlnet_skips: + # add to skip like u-net controlnet + skip = skip + controlnet_skips.pop() + 
class AdaLN(nn.Module):
    """
    Build the six modulation tensors (shift/scale/gate for attention and MLP)
    consumed by an adaptive-layer-norm block.

    The result of ``forward`` is always reshaped to ``(B, 6, -1)``. How it is
    obtained depends on ``ada_mode``:

    * ``'ada'``: a per-block linear layer applied to ``time_token``.
    * ``'ada_single'``: the externally supplied ``time_ada`` plus a learned
      ``(6, dim)`` table owned by this block.
    * ``'ada_sola'``: the externally supplied ``time_ada`` plus a low-rank
      (``lora_a`` then ``lora_b``) update computed from ``time_token`` and
      scaled by ``alpha / r``.
    * ``'ada_sola_bias'``: as ``'ada_sola'``, plus the learned ``(6, dim)``
      table.

    Args:
        dim: feature width of the block being modulated.
        ada_mode: one of the modes listed above.
        r: rank factor of the low-rank update (sola modes only).
        alpha: numerator of the low-rank scaling (sola modes only).

    Raises:
        NotImplementedError: if ``ada_mode`` is not a supported mode.
        TypeError: if a sola mode is requested without ``r`` and ``alpha``.
    """

    _SOLA_MODES = ('ada_sola', 'ada_sola_bias')
    # The constructor used to test for the misspelling 'ada_solo' while
    # `forward` tested for 'ada_sola', so neither spelling worked end to end.
    # The misspelling is still accepted and mapped onto the real mode name.
    _LEGACY_ALIASES = {'ada_solo': 'ada_sola'}

    def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
        super().__init__()
        ada_mode = self._LEGACY_ALIASES.get(ada_mode, ada_mode)
        self.ada_mode = ada_mode
        self.scale_shift_table = None
        if ada_mode == 'ada':
            # the SiLU on the time embedding is applied by the caller
            self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
        elif ada_mode == 'ada_single':
            # adaln used in pixel-art alpha
            self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
        elif ada_mode in self._SOLA_MODES:
            if r is None or alpha is None:
                raise TypeError(
                    f"ada_mode={ada_mode!r} requires both 'r' and 'alpha'"
                )
            self.lora_a = nn.Linear(dim, r * 6, bias=False)
            self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
            self.scaling = alpha / r
            if ada_mode == 'ada_sola_bias':
                # take bias out for consistency
                self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
        else:
            raise NotImplementedError(f"unsupported ada_mode: {ada_mode!r}")

    def forward(self, time_token=None, time_ada=None):
        """
        Return the modulation tensor reshaped to ``(B, 6, -1)``.

        Args:
            time_token: time embedding; read in ``'ada'`` and the sola modes.
            time_ada: shared modulation computed by the caller; must be
                ``None`` in ``'ada'`` mode and is required in every other
                mode.
        """
        if self.ada_mode == 'ada':
            assert time_ada is None
            batch = time_token.shape[0]
            return self.time_ada(time_token).reshape(batch, 6, -1)

        if self.ada_mode == 'ada_single':
            batch = time_ada.shape[0]
            return self.scale_shift_table[None] + time_ada.reshape(batch, 6, -1)

        if self.ada_mode in self._SOLA_MODES:
            batch = time_ada.shape[0]
            low_rank = self.lora_b(self.lora_a(time_token)) * self.scaling
            time_ada = (time_ada + low_rank).reshape(batch, 6, -1)
            if self.scale_shift_table is not None:
                time_ada = self.scale_shift_table[None] + time_ada
            return time_ada

        raise NotImplementedError(f"unsupported ada_mode: {self.ada_mode!r}")
class FinalBlock(nn.Module):
    """
    Output head: normalise the tokens, project each one to a flattened patch,
    fold the patches back with ``unpatchify`` and optionally smooth the result
    with a 3-wide convolution.

    ``input_type`` selects between ``'2d'`` (``patch_size**2 * in_chans``
    values per token, ``nn.Conv2d``) and ``'1d'`` (``patch_size * in_chans``
    values per token, ``nn.Conv1d``). For any other value no projection or
    final layer is created.
    """

    def __init__(
        self,
        embed_dim,
        patch_size,
        in_chans,
        img_size,
        input_type='2d',
        norm_layer=nn.LayerNorm,
        use_conv=True,
        use_adanorm=True
    ):
        super().__init__()
        self.in_chans = in_chans
        self.img_size = img_size
        self.input_type = input_type

        self.norm = norm_layer(embed_dim)
        self.use_adanorm = bool(use_adanorm)

        if input_type == '2d':
            values_per_patch = patch_size**2 * in_chans
            conv_cls = nn.Conv2d
        elif input_type == '1d':
            values_per_patch = patch_size * in_chans
            conv_cls = nn.Conv1d
        else:
            return

        self.patch_dim = values_per_patch
        self.linear = nn.Linear(embed_dim, values_per_patch, bias=True)
        if use_conv:
            self.final_layer = conv_cls(in_chans, in_chans, 3, padding=1)
        else:
            self.final_layer = nn.Identity()

    def forward(self, x, time_ada=None, extras=0):
        """
        Map tokens to the output signal.

        Args:
            x: token sequence with three axes; the first ``extras`` positions
                on axis 1 are dropped before anything else happens.
            time_ada: modulation reshaped to ``(B, 2, -1)`` and split into
                shift and scale; only read when ``use_adanorm`` is set.
            extras: number of leading tokens to discard.
        """
        batch, _, _ = x.shape
        # only handle generation target
        tokens = x[:, extras:, :]
        if self.use_adanorm:
            shift, scale = time_ada.reshape(batch, 2, -1).chunk(2, dim=1)
            tokens = film_modulate(self.norm(tokens), shift, scale)
        else:
            tokens = self.norm(tokens)
        patches = self.linear(tokens)
        folded = unpatchify(
            patches, self.in_chans, self.input_type, self.img_size
        )
        return self.final_layer(folded)
self.input_type = input_type + if self.input_type == '2d': + num_patches = (img_size[0] // + patch_size) * (img_size[1] // patch_size) + elif self.input_type == '1d': + num_patches = img_size // patch_size + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + input_type=input_type + ) + out_chans = in_chans if out_chans is None else out_chans + self.out_chans = out_chans + + # position embedding + self.rope = rope_mode + self.x_pe = PE_wrapper( + dim=embed_dim, method=pe_method, length=num_patches + ) + + logger.info(f'x position embedding: {pe_method}') + logger.info(f'rope mode: {self.rope}') + + # time embed + self.time_embed = TimestepEmbedder(embed_dim) + self.time_fusion = time_fusion + self.use_adanorm = False + + # cls embed + if cls_dim is not None: + self.cls_embed = nn.Sequential( + nn.Linear(cls_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + else: + self.cls_embed = None + + # time fusion + if time_fusion == 'token': + # put token at the beginning of sequence + self.extras = 2 if self.cls_embed else 1 + self.time_pe = PE_wrapper( + dim=embed_dim, method='abs', length=self.extras + ) + elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']: + self.use_adanorm = True + # aviod repetitive silu for each adaln block + self.time_act = nn.SiLU() + self.extras = 0 + self.time_ada_final = nn.Linear( + embed_dim, 2 * embed_dim, bias=True + ) + if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']: + # shared adaln + self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True) + else: + self.time_ada = None + else: + raise NotImplementedError + logger.info(f'time fusion mode: {self.time_fusion}') + + # context + # use a simple projection + self.use_context = False + self.context_cross = False + self.context_max_length = context_max_length + self.context_fusion = 'none' + if context_dim is not None: + self.use_context = True + self.context_embed 
= nn.Sequential( + nn.Linear(context_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, embed_dim, bias=True), + ) + self.context_fusion = context_fusion + if context_fusion == 'concat' or context_fusion == 'joint': + self.extras += context_max_length + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + # no cross attention layers + context_dim = None + elif context_fusion == 'cross': + self.context_pe = PE_wrapper( + dim=embed_dim, + method=context_pe_method, + length=context_max_length + ) + self.context_cross = True + context_dim = embed_dim + else: + raise NotImplementedError + logger.info(f'context fusion mode: {context_fusion}') + logger.info(f'context position embedding: {context_pe_method}') + + self.use_skip = skip + + # norm layers + if norm_layer == 'layernorm': + norm_layer = nn.LayerNorm + elif norm_layer == 'rmsnorm': + norm_layer = RMSNorm + else: + raise NotImplementedError + + logger.info(f'use long skip connection: {skip}') + self.in_blocks = nn.ModuleList([ + DiTBlock( + dim=embed_dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for _ in range(depth // 2) + ]) + + self.mid_block = DiTBlock( + dim=embed_dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=False, + skip_norm=False, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) + + self.out_blocks = nn.ModuleList([ + 
DiTBlock( + dim=embed_dim, + context_dim=context_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + qk_norm=qk_norm, + act_layer=act_layer, + norm_layer=norm_layer, + time_fusion=time_fusion, + ada_sola_rank=ada_sola_rank, + ada_sola_alpha=ada_sola_alpha, + skip=skip, + skip_norm=skip_norm, + rope_mode=self.rope, + context_norm=context_norm, + use_checkpoint=use_checkpoint + ) for _ in range(depth // 2) + ]) + + # FinalLayer block + self.use_conv = use_conv + self.final_block = FinalBlock( + embed_dim=embed_dim, + patch_size=patch_size, + img_size=img_size, + in_chans=out_chans, + input_type=input_type, + norm_layer=norm_layer, + use_conv=use_conv, + use_adanorm=self.use_adanorm + ) + self.initialize_weights() + + def _init_ada(self): + if self.time_fusion == 'ada': + nn.init.constant_(self.time_ada_final.weight, 0) + nn.init.constant_(self.time_ada_final.bias, 0) + for block in self.in_blocks: + nn.init.constant_(block.adaln.time_ada.weight, 0) + nn.init.constant_(block.adaln.time_ada.bias, 0) + nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0) + nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0) + for block in self.out_blocks: + nn.init.constant_(block.adaln.time_ada.weight, 0) + nn.init.constant_(block.adaln.time_ada.bias, 0) + elif self.time_fusion == 'ada_single': + nn.init.constant_(self.time_ada.weight, 0) + nn.init.constant_(self.time_ada.bias, 0) + nn.init.constant_(self.time_ada_final.weight, 0) + nn.init.constant_(self.time_ada_final.bias, 0) + elif self.time_fusion in ['ada_sola', 'ada_sola_bias']: + nn.init.constant_(self.time_ada.weight, 0) + nn.init.constant_(self.time_ada.bias, 0) + nn.init.constant_(self.time_ada_final.weight, 0) + nn.init.constant_(self.time_ada_final.bias, 0) + for block in self.in_blocks: + nn.init.kaiming_uniform_( + block.adaln.lora_a.weight, a=math.sqrt(5) + ) + nn.init.constant_(block.adaln.lora_b.weight, 0) + nn.init.kaiming_uniform_( + 
def initialize_weights(self):
    """Reset the parameters of the whole network.

    Steps, in this order (the order matters for the random number stream):
    every ``nn.Linear`` gets Xavier-uniform weights and a zero bias; the
    patch-embedding convolution is initialised like a linear layer; the
    adaptive-norm layers are reset through ``self._init_ada()`` when
    ``self.use_adanorm`` is set; cross-attention output projections are
    zeroed when ``self.context_cross`` is set; the last layer of
    ``self.cls_embed`` is zeroed when both a class embedding and adaptive
    norm are in use; finally the output convolution is initialised when
    ``self.use_conv`` is set.
    """
    def _reset_linear(layer):
        # Only plain linear layers are touched by the generic pass.
        if not isinstance(layer, nn.Linear):
            return
        nn.init.xavier_uniform_(layer.weight)
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0)

    self.apply(_reset_linear)

    # Treat the patch convolution kernel as a 2-D (out, in * kernel) matrix
    # so that Xavier uses the same fan computation as for a linear layer.
    patch_proj = self.patch_embed.proj
    kernel = patch_proj.weight.data
    nn.init.xavier_uniform_(kernel.view(kernel.shape[0], -1))
    nn.init.constant_(patch_proj.bias, 0)

    # Zero-out AdaLN
    if self.use_adanorm:
        self._init_ada()

    # Zero-out Cross Attention output projections of every block
    if self.context_cross:
        for blk in (*self.in_blocks, self.mid_block, *self.out_blocks):
            out_proj = blk.cross_attn.proj
            nn.init.constant_(out_proj.weight, 0)
            nn.init.constant_(out_proj.bias, 0)

    # Zero-out cls embedding (only relevant together with adaptive norm)
    if self.cls_embed and self.use_adanorm:
        last_layer = self.cls_embed[-1]
        nn.init.constant_(last_layer.weight, 0)
        nn.init.constant_(last_layer.bias, 0)

    # The output projection is deliberately NOT zeroed here: zero-init may
    # help noise-prediction but is left out for v-prediction.

    # init out Conv
    if self.use_conv:
        out_conv = self.final_block.final_layer
        nn.init.xavier_uniform_(out_conv.weight)
        nn.init.constant_(out_conv.bias, 0)
def forward(
    self,
    x,
    timesteps,
    context,
    x_mask=None,
    context_mask=None,
    cls_token=None,
    controlnet_skips=None,
):
    """Run the U-shaped transformer on a batch.

    Args:
        x: input passed to ``self.patch_embed``; the embedded result must be
            3-D (it is unpacked as batch, length, dim).
        timesteps: diffusion timesteps. A 0-dim tensor is expanded to one
            value per batch element.
        context: conditioning sequence; only used when ``self.use_context``
            is set. It is either concatenated in front of ``x``
            ('concat'/'joint' fusion) or handed to the blocks as ``context``.
        x_mask: optional boolean mask over the tokens of ``x``.
        context_mask: optional boolean mask over the context tokens.
        cls_token: input of ``self.cls_embed`` when a class embedding exists.
        controlnet_skips: optional sequence of residual tensors. They are
            consumed from the end, one per output block: added to the long
            skip connection when ``self.use_skip`` is set, otherwise added
            directly to ``x``. The caller's sequence is left untouched.

    Returns:
        The output of ``self.final_block``.
    """
    # The residuals are consumed with pop() below; work on a private copy so
    # the caller can reuse its list for another forward pass.
    if controlnet_skips:
        controlnet_skips = list(controlnet_skips)

    # make it compatible with int time step during inference
    if timesteps.dim() == 0:
        timesteps = timesteps.expand(x.shape[0]
                                     ).to(x.device, dtype=torch.long)

    x = self.patch_embed(x)
    x = self.x_pe(x)

    # Also enforces that the embedded input is 3-D.
    B, _, _ = x.shape

    if self.use_context:
        context_token = self.context_embed(context)
        context_token = self.context_pe(context_token)
        if self.context_fusion in ('concat', 'joint'):
            x, x_mask = self._concat_x_context(
                x=x,
                context=context_token,
                x_mask=x_mask,
                context_mask=context_mask
            )
            # context now lives inside x, so no cross-attention input remains
            context_token, context_mask = None, None
    else:
        context_token, context_mask = None, None

    time_token = self.time_embed(timesteps)
    if self.cls_embed:
        cls_token = self.cls_embed(cls_token)
    time_ada = None
    time_ada_final = None
    if self.use_adanorm:
        # Time (and class) conditioning is injected through adaptive norm.
        if self.cls_embed:
            time_token = time_token + cls_token
        time_token = self.time_act(time_token)
        time_ada_final = self.time_ada_final(time_token)
        if self.time_ada is not None:
            time_ada = self.time_ada(time_token)
    else:
        # Time (and class) conditioning is prepended as extra tokens.
        time_token = time_token.unsqueeze(dim=1)
        if self.cls_embed:
            cls_token = cls_token.unsqueeze(dim=1)
            time_token = torch.cat([time_token, cls_token], dim=1)
        time_token = self.time_pe(time_token)
        x = torch.cat((time_token, x), dim=1)
        if x_mask is not None:
            # the prepended tokens are always valid
            x_mask = torch.cat([
                torch.ones(B, time_token.shape[1],
                           device=x_mask.device).bool(), x_mask
            ],
                               dim=1)
        time_token = None

    skips = []
    for blk in self.in_blocks:
        x = blk(
            x=x,
            time_token=time_token,
            time_ada=time_ada,
            skip=None,
            context=context_token,
            x_mask=x_mask,
            context_mask=context_mask,
            extras=self.extras
        )
        if self.use_skip:
            skips.append(x)

    x = self.mid_block(
        x=x,
        time_token=time_token,
        time_ada=time_ada,
        skip=None,
        context=context_token,
        x_mask=x_mask,
        context_mask=context_mask,
        extras=self.extras
    )
    for blk in self.out_blocks:
        if self.use_skip:
            skip = skips.pop()
            if controlnet_skips:
                # add to skip like u-net controlnet
                skip = skip + controlnet_skips.pop()
        else:
            skip = None
            if controlnet_skips:
                # directly add to x
                x = x + controlnet_skips.pop()

        x = blk(
            x=x,
            time_token=time_token,
            time_ada=time_ada,
            skip=skip,
            context=context_token,
            x_mask=x_mask,
            context_mask=context_mask,
            extras=self.extras
        )

    x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)

    return x
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
    """Fill ``tensor`` in place with samples from a truncated normal.

    Values follow a normal distribution with the given ``mean`` and ``std``,
    restricted to the interval ``[a, b]``. The defaults (0, 1, -2, 2) let
    callers pass only the arguments they care about, e.g.
    ``trunc_normal_(param, std=.02)``.

    Args:
        tensor: tensor to overwrite.
        mean: mean of the underlying normal distribution.
        std: standard deviation of the underlying normal distribution.
        a: lower bound of the truncation interval.
        b: upper bound of the truncation interval.

    Returns:
        The same ``tensor`` object, after in-place modification.
    """
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1. + math.erf(x / math.sqrt(2.))) / 2.

    if (mean < a - 2*std) or (mean > b + 2*std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2
        )

    with torch.no_grad():
        # Values are generated by using a truncated uniform distribution and
        # then using the inverse CDF for the normal distribution.
        # Get upper and lower cdf values
        lower_cdf = norm_cdf((a-mean) / std)
        upper_cdf = norm_cdf((b-mean) / std)

        # Uniformly fill tensor with values from [l, u], then translate to
        # [2l-1, 2u-1].
        tensor.uniform_(2*lower_cdf - 1, 2*upper_cdf - 1)

        # Use inverse cdf transform for normal distribution to get truncated
        # standard normal
        tensor.erfinv_()

        # Transform to proper mean, std
        tensor.mul_(std * math.sqrt(2.))
        tensor.add_(mean)

        # Clamp to ensure it's in the proper range
        tensor.clamp_(min=a, max=b)
        return tensor
class TimestepEmbedder(nn.Module):
    """
    Turns scalar timesteps into vectors: sinusoidal features of size
    ``frequency_embedding_size`` followed by a Linear -> SiLU -> Linear MLP.
    """
    def __init__(
        self, hidden_size, frequency_embedding_size=256, out_size=None
    ):
        super().__init__()
        # Output width falls back to the hidden width when not given.
        target_size = hidden_size if out_size is None else out_size
        layers = [
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, target_size, bias=True),
        ]
        self.mlp = nn.Sequential(*layers)
        self.frequency_embedding_size = frequency_embedding_size

    def forward(self, t):
        # Cast the sinusoidal features to the dtype of the MLP weights.
        mlp_dtype = self.mlp[0].weight.dtype
        features = timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(features.type(mlp_dtype))
class PatchEmbed(nn.Module):
    """
    Patch embedding: a strided convolution (kernel == stride == patch size)
    over a 2-D or 1-D input, returned as a token sequence.
    """
    def __init__(self, patch_size, in_chans=3, embed_dim=768, input_type='2d'):
        super().__init__()
        self.patch_size = patch_size
        self.input_type = input_type
        # Pick the convolution matching the input layout; any other
        # input_type leaves the module without a projection.
        conv_cls = {'2d': nn.Conv2d, '1d': nn.Conv1d}.get(input_type)
        if conv_cls is not None:
            self.proj = conv_cls(
                in_chans,
                embed_dim,
                kernel_size=patch_size,
                stride=patch_size,
                bias=True
            )

    def forward(self, x):
        # Every patched axis must be an exact multiple of the patch size.
        if self.input_type == '2d':
            _, _, height, width = x.shape
            assert height % self.patch_size == 0 and width % self.patch_size == 0
        elif self.input_type == '1d':
            _, _, length = x.shape
            assert length % self.patch_size == 0

        # (B, embed_dim, *grid) -> (B, num_patches, embed_dim)
        tokens = self.proj(x)
        return tokens.flatten(2).transpose(1, 2)
class PE_wrapper(nn.Module):
    """Selects and applies one positional-embedding scheme.

    The input of ``forward`` is a token sequence laid out as
    (batch, length, dim).

    Args:
        dim: embedding dimension.
        method: 'abs' (learned absolute table), 'conv' (convolutional
            embedding), 'sinu' (fixed sinusoidal table) or 'none'.
        length: maximum sequence length; used by 'abs' and 'sinu'.
        **kwargs: forwarded to ``PositionalConvEmbedding`` for 'conv'.

    Raises:
        NotImplementedError: for an unknown ``method``.
    """
    def __init__(self, dim=768, method='abs', length=None, **kwargs):
        super().__init__()
        self.method = method
        if method == 'abs':
            # init absolute pe like UViT
            self.length = length
            self.abs_pe = nn.Parameter(torch.zeros(1, length, dim))
            # Use the torch initialiser: it has defaults for mean and the
            # truncation bounds, so passing only ``std`` is valid.
            nn.init.trunc_normal_(self.abs_pe, std=.02)
        elif method == 'conv':
            self.conv_pe = PositionalConvEmbedding(dim=dim, **kwargs)
        elif method == 'sinu':
            self.sinu_pe = SinusoidalPositionalEncoding(dim=dim, length=length)
        elif method == 'none':
            # skip pe
            self.id = nn.Identity()
        else:
            raise NotImplementedError

    def forward(self, x):
        if self.method == 'abs':
            _, L, _ = x.shape
            assert L <= self.length
            x = x + self.abs_pe[:, :L, :]
        elif self.method == 'conv':
            # The convolution works on (batch, channels, time) while x is
            # (batch, length, dim): swap the axes around the call.
            x = x + self.conv_pe(x.transpose(1, 2)).transpose(1, 2)
        elif self.method == 'sinu':
            x = self.sinu_pe(x)
        elif self.method == 'none':
            x = self.id(x)
        else:
            raise NotImplementedError
        return x
class GEGLU(nn.Module):
    """GELU-gated linear unit: one projection yields a value half and a gate
    half; the output is ``value * gelu(gate)``."""
    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
        super().__init__()
        # Twice the output width: first half is the value, second the gate.
        self.proj = nn.Linear(dim_in, 2 * dim_out, bias=bias)

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        if gate.device.type == "mps":
            # mps: gelu is not implemented for float16
            upcast = gate.to(dtype=torch.float32)
            return F.gelu(upcast).to(dtype=gate.dtype)
        return F.gelu(gate)

    def forward(self, hidden_states):
        value, gate = self.proj(hidden_states).chunk(2, dim=-1)
        return value * self.gelu(gate)
class GESnake(nn.Module):
    """Gated unit whose gate goes through ``snake_beta``: one projection
    yields a value half and a gate half, and the output is
    ``value * snake_beta(gate, alpha, beta)``."""
    def __init__(self, dim_in, dim_out, bias, alpha_trainable=True):
        super().__init__()
        self.proj = nn.Linear(dim_in, 2 * dim_out, bias=bias)
        # alpha and beta both start at one and share the same
        # trainable / frozen switch.
        self.alpha = nn.Parameter(
            torch.ones(1, 1, dim_out), requires_grad=alpha_trainable
        )
        self.beta = nn.Parameter(
            torch.ones(1, 1, dim_out), requires_grad=alpha_trainable
        )

    def forward(self, x):
        value, gate = self.proj(x).chunk(2, dim=-1)
        return value * snake_beta(gate, self.alpha, self.beta)
def rotate_half(x):
    """Split the last axis in two halves (x1, x2) and return (-x2, x1)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


# disable in checkpoint mode
# @torch.jit.script
def apply_rotary_pos_emb(x, cos, sin):
    """Rotate ``x`` with the given cos/sin tables.

    The tables are cut to the length of ``x`` along the sequence axis
    (second to last), so they may be longer than ``x``.
    """
    # NOTE: This could probably be moved to Triton
    # Handle a possible sequence length mismatch in between q and k
    cos = cos[:, :, :x.shape[-2], :]
    sin = sin[:, :, :x.shape[-2], :]
    return (x*cos) + (rotate_half(x) * sin)


class RotaryEmbedding(torch.nn.Module):
    """
    The rotary position embeddings from RoFormer_ (Su et. al).
    A crucial insight from the method is that the query and keys are
    transformed by rotation matrices which depend on the relative positions.

    Other implementations are available in the Rotary Transformer repo_ and in
    GPT-NeoX_, GPT-NeoX was an inspiration

    .. _RoFormer: https://arxiv.org/abs/2104.09864
    .. _repo: https://github.com/ZhuiyiTechnology/roformer
    .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox


    .. warning: Please note that this embedding is not registered on purpose, as it is transformative
        (it does not create the embedding dimension) and will likely be picked up (imported) on a ad-hoc basis
    """
    def __init__(self, dim: int):
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000**(torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=-2, seq_len=None):
        """Return (cos, sin) tables, rebuilding the cache when needed.

        Args:
            x: reference tensor (expected B, H, L, D) providing device and
                dtype, and the sequence length unless ``seq_len`` is given.
            seq_dimension: axis of ``x`` holding the sequence length.
            seq_len: explicit table length; overrides the length of ``x``.
        """
        if seq_len is None:
            seq_len = x.shape[seq_dimension]

        # Reset the tables if the sequence length has changed,
        # or if we're on a new device (possibly due to tracing for instance)
        # also make sure dtype wont change
        # (the length test comes first, so the None cache of a fresh module
        # is never dereferenced)
        if (
            seq_len != self._seq_len_cached or
            self._cos_cached.device != x.device or
            self._cos_cached.dtype != x.dtype
        ):
            self._seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(x.dtype))
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            self._cos_cached = emb.cos()[None, None, :, :].to(x.dtype)
            self._sin_cached = emb.sin()[None, None, :, :].to(x.dtype)

        return self._cos_cached, self._sin_cached

    def forward(self, q, k):
        """Rotate queries and (optionally) keys.

        The computation runs in float32 and each result is cast back to the
        dtype of its input. ``k`` may be ``None``, in which case ``None`` is
        returned in its place.
        """
        q_float = q.float()
        # Size the tables for the longer of q and k: they are cut per input
        # when applied, but a table shorter than k cannot be broadcast.
        table_len = q.shape[-2]
        if k is not None:
            table_len = max(table_len, k.shape[-2])
        cos, sin = self._update_cos_sin_tables(
            q_float, seq_dimension=-2, seq_len=table_len
        )

        q_rot = apply_rotary_pos_emb(q_float, cos, sin).type_as(q)
        if k is None:
            return q_rot, None
        k_rot = apply_rotary_pos_emb(k.float(), cos, sin).type_as(k)
        return q_rot, k_rot
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
) -> torch.Tensor:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True).
            Either one value for the whole batch or one value per batch row.
        mask_length: span length ("static"), or the parameter of the length distribution for the other mask types
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from possion distribution with lambda = mask length
        mask_other: secondary parameter of the "uniform" and "normal" mask types
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans

    Returns:
        Boolean tensor of the given shape, True where an element is masked.
        A row for which zero spans are drawn stays entirely False.
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    # One probability per batch row; a scalar is shared by every row.
    mask_prob = np.broadcast_to(np.asarray(mask_prob, dtype=float), (bsz, ))

    # Number of spans per row, with a random number added for
    # probabilistic rounding
    all_num_mask = np.floor(
        mask_prob * all_sz / float(mask_length) + np.random.rand(bsz)
    ).astype(int)

    # Apply the max operation with min_masks for each element
    all_num_mask = np.maximum(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if padding_mask is not None:
            # only the non-padded part of the row can be masked
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob[i] * sz / float(mask_length) + np.random.rand()
            )
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = int(all_num_mask[i])

        if num_mask == 0:
            # nothing to mask in this row
            mask_idcs.append(np.empty(0, dtype=int))
            continue

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            lengths = np.random.randint(
                mask_other, mask_length*2 + 1, size=num_mask
            )
        elif mask_type == "normal":
            lengths = np.random.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = np.random.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            # every sampled span is empty: force one real span
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # Place one span inside [s, e) and return the leftover
                # sub-ranges that may still host a span.
                high = e - length
                # a span that exactly fills the range can only start at s
                span_start = s if high <= s else np.random.randint(s, high)
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - keep_length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # weight each free range by its size, 0 if the span
                # (plus spacing) does not fit
                lens = np.fromiter(
                    (
                        e - s if e - s >= length + min_space else 0
                        for s, e in parts
                    ),
                    int,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                probs = lens / l_sum
                c = np.random.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            mask_idc = np.asarray(mask_idc, dtype=int)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1

            starts = np.random.choice(sz - min_len, num_mask, replace=False)

            # expand every start into the indices of its span
            mask_idc = np.asarray(
                [
                    start + offset for start, length in zip(starts, lengths)
                    for offset in range(length)
                ],
                dtype=int,
            )

        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))

    for i, mask_idc in enumerate(mask_idcs):
        mask[i, mask_idc] = True

    return torch.tensor(mask)
b/stabilityai-stable-diffusion-2-1/scheduler/scheduler_config.json new file mode 100644 index 0000000000000000000000000000000000000000..536b82b4e3c62c4898b4ac8725bc514f2a98f5de --- /dev/null +++ b/stabilityai-stable-diffusion-2-1/scheduler/scheduler_config.json @@ -0,0 +1,14 @@ +{ + "_class_name": "DDIMScheduler", + "_diffusers_version": "0.8.0", + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "clip_sample": false, + "num_train_timesteps": 1000, + "prediction_type": "v_prediction", + "set_alpha_to_one": false, + "skip_prk_steps": true, + "steps_offset": 1, + "trained_betas": null +} diff --git a/utils/__pycache__/accelerate_utilities.cpython-310.pyc b/utils/__pycache__/accelerate_utilities.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c605a3475f9e51b5ee0334105b5f07e68f75b1d Binary files /dev/null and b/utils/__pycache__/accelerate_utilities.cpython-310.pyc differ diff --git a/utils/__pycache__/audiotime_event.cpython-310.pyc b/utils/__pycache__/audiotime_event.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8261001c40ce7f6e30293add221ba7c8e7eecc98 Binary files /dev/null and b/utils/__pycache__/audiotime_event.cpython-310.pyc differ diff --git a/utils/__pycache__/audiotime_event_merge.cpython-310.pyc b/utils/__pycache__/audiotime_event_merge.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9eddba4297a5a8cb352202a3c7ac8d887829cdca Binary files /dev/null and b/utils/__pycache__/audiotime_event_merge.cpython-310.pyc differ diff --git a/utils/__pycache__/audiotime_event_new.cpython-310.pyc b/utils/__pycache__/audiotime_event_new.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4cb72e4b63fa814972007eadca37b904d3b718d8 Binary files /dev/null and b/utils/__pycache__/audiotime_event_new.cpython-310.pyc differ diff --git a/utils/__pycache__/config.cpython-310.pyc 
b/utils/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..35960a066c1728f47be4dc5bb34aa9542d57f0ad Binary files /dev/null and b/utils/__pycache__/config.cpython-310.pyc differ diff --git a/utils/__pycache__/diffsinger_utilities.cpython-310.pyc b/utils/__pycache__/diffsinger_utilities.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32fe663324cb50dc21e242028664d458819460ca Binary files /dev/null and b/utils/__pycache__/diffsinger_utilities.cpython-310.pyc differ diff --git a/utils/__pycache__/filter_data.cpython-310.pyc b/utils/__pycache__/filter_data.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd22bc6b77d1cee0b356317ac2241bb0f5073c16 Binary files /dev/null and b/utils/__pycache__/filter_data.cpython-310.pyc differ diff --git a/utils/__pycache__/llm.cpython-310.pyc b/utils/__pycache__/llm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59a1862f7bfcae5bc77fe8fc6d864f07ac02f08a Binary files /dev/null and b/utils/__pycache__/llm.cpython-310.pyc differ diff --git a/utils/__pycache__/llm_xiapi.cpython-310.pyc b/utils/__pycache__/llm_xiapi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..12619d242e4d5721459855251807fbc6d835de70 Binary files /dev/null and b/utils/__pycache__/llm_xiapi.cpython-310.pyc differ diff --git a/utils/__pycache__/log_helper.cpython-310.pyc b/utils/__pycache__/log_helper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..653ff16ba4b0ed67826f83e510c5a41204e649a7 Binary files /dev/null and b/utils/__pycache__/log_helper.cpython-310.pyc differ diff --git a/utils/__pycache__/logging.cpython-310.pyc b/utils/__pycache__/logging.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bff87f124956fe7681bb5eb52dffdbea99c833f2 Binary files /dev/null and b/utils/__pycache__/logging.cpython-310.pyc 
class AcceleratorSaveTrainableParams(Accelerator):
    """Accelerator whose state dict can be limited to selected parameters.

    When the model exposes a ``param_names_to_save`` attribute, only the
    state-dict entries whose key is contained in it are returned; otherwise
    the parent's state dict is returned unchanged.
    """
    def get_state_dict(self, model, unwrap=True):
        full_state = super().get_state_dict(model, unwrap)
        if not hasattr(model, "param_names_to_save"):
            return full_state
        wanted = model.param_names_to_save
        return {
            name: tensor
            for name, tensor in full_state.items() if name in wanted
        }
randomize + + def __call__(self, signal): + n, s = signal.shape + start = 0 if ( + not self.randomize + ) else torch.randint(0, + max(0, s - self.n_samples) + 1, []).item() + end = start + self.n_samples + output = signal.new_zeros([n, self.n_samples]) + output[:, :min(s, self.n_samples)] = signal[:, start:end] + return output + + +def set_audio_channels(audio, target_channels): + if target_channels == 1: + # Convert to mono + audio = audio.mean(1, keepdim=True) + elif target_channels == 2: + # Convert to stereo + if audio.shape[1] == 1: + audio = audio.repeat(1, 2, 1) + elif audio.shape[1] > 2: + audio = audio[:, :2, :] + return audio + + +def prepare_audio( + audio, in_sr, target_sr, target_length, target_channels, device +): + + audio = audio.to(device) + + if in_sr != target_sr: + resample_tf = torchaudio.transforms.Resample(in_sr, + target_sr).to(device) + audio = resample_tf(audio) + + audio = PadCrop(target_length, randomize=False)(audio) + + # Add batch dimension + if audio.dim() == 1: + audio = audio.unsqueeze(0).unsqueeze(0) + elif audio.dim() == 2: + audio = audio.unsqueeze(0) + + audio = set_audio_channels(audio, target_channels) + + return audio diff --git a/utils/audiotime_event_merge.py b/utils/audiotime_event_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..74c58afffe49e40248f32fdec03c55651470fb3f --- /dev/null +++ b/utils/audiotime_event_merge.py @@ -0,0 +1,99 @@ +import json + +def get_event_synonyms(): + file_path = "./utils/merge_content.json" + with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + result = {} + for item in data: + event = item.get("event") + phrases = item.get("phrases", []) + result[event] = phrases + + return result + + +import random +import re + +def replace_event_synonyms(caption, onset): + """ + Replace event names in both caption(TCC) and onset(TDC) string with corresponding free text descriptions. + + Args: + caption (str): Caption text containing event names. 
+ onset (str): Onset string, formatted as "event__start-end--event2__start-end". + + Returns: + new_caption (str): Caption with event names replaced by descriptions. + new_onset (str): Onset string with event names replaced by descriptions. + + Notes: + - Synonyms are fetched using get_event_synonyms(). + - For each event, a random synonym is chosen. + - All occurrences in caption (with correct pluralization) and onset are replaced. + """ + event_pattern = r"([a-zA-Z_()\s]+?)__((?:[\d\.\-]+_?)+)(?=--|$)" + events = re.findall(event_pattern, onset) + synonyms_dict = get_event_synonyms() + replacements = {} + # Choose a random synonym for each unique event + for event_name, _ in events: + if event_name not in replacements: + candidates = synonyms_dict.get(event_name, [event_name]) + replacements[event_name] = random.choice(candidates) + # Replace event names in the onset string + new_onset = "--".join([ + f"{replacements[event]}__{timestamps}" + for event, timestamps in events + ]) + # Replace event names in the caption, handling plural forms and case + new_caption = caption + for orig, repl in replacements.items(): + orig_space = orig.replace("_", " ") + repl_space = repl.replace("_", " ") + + escaped_orig_space = re.escape(orig_space) + + pattern = rf"(? 
# Reserved vocabulary tokens. They must be distinct strings: the *_ID values
# below are derived with list.index(), which returns the first match, so
# identical literals would collapse all three ids to 0.
PAD = "<pad>"
EOS = "<EOS>"
UNK = "<UNK>"
SEG = "|"
RESERVED_TOKENS = [PAD, EOS, UNK]
NUM_RESERVED_TOKENS = len(RESERVED_TOKENS)
PAD_ID = RESERVED_TOKENS.index(PAD)  # Normally 0
EOS_ID = RESERVED_TOKENS.index(EOS)  # Normally 1
UNK_ID = RESERVED_TOKENS.index(UNK)  # Normally 2

# F0 quantization range (Hz) and the same bounds on the mel scale.
F0_BIN = 256
F0_MAX = 1100.0
F0_MIN = 50.0
F0_MEL_MIN = 1127 * np.log(1 + F0_MIN/700)
F0_MEL_MAX = 1127 * np.log(1 + F0_MAX/700)


def f0_to_coarse(f0):
    """Quantize F0 values (Hz) into integer mel-scale bins.

    Values are mapped to the mel scale, rescaled so that ``F0_MIN`` lands on
    bin 1 and ``F0_MAX`` on bin ``F0_BIN - 1``, clamped to that range and
    rounded. Entries whose mel value is not positive (e.g. ``f0 == 0``) end
    up in bin 1.

    Args:
        f0: ``torch.Tensor`` or ``numpy.ndarray`` of F0 values. The input is
            not modified.

    Returns:
        Integer tensor (``torch.long``) or integer ndarray of the same shape
        with values in ``[1, F0_BIN - 1]``.

    Raises:
        AssertionError: If a computed bin falls outside ``[1, F0_BIN - 1]``.
    """
    is_torch = isinstance(f0, torch.Tensor)
    if is_torch:
        f0_mel = 1127 * (1 + f0/700).log()
    else:
        f0_mel = 1127 * np.log(1 + f0/700)

    positive = f0_mel > 0
    f0_mel[positive] = (
        (f0_mel[positive] - F0_MEL_MIN) * (F0_BIN-2) /
        (F0_MEL_MAX-F0_MEL_MIN) + 1
    )

    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > F0_BIN - 1] = F0_BIN - 1

    if is_torch:
        f0_coarse = (f0_mel + 0.5).long()
        n_items = f0_coarse.numel()
    else:
        # `np.int` was only an alias of the builtin `int` and no longer
        # exists in NumPy >= 1.24; the builtin gives the same dtype.
        f0_coarse = np.rint(f0_mel).astype(int)
        n_items = f0_coarse.size

    # max()/min() are undefined on empty input, so only check non-empty data.
    if n_items:
        assert f0_coarse.max() <= F0_BIN - 1 and f0_coarse.min() >= 1, (
            f0_coarse.max(), f0_coarse.min()
        )
    return f0_coarse
def remove_empty_lines(text):
    """Strip every line and drop all lines that are empty after stripping.

    Args:
        text: Non-empty list of strings (e.g. the result of
            ``file.readlines()``).

    Returns:
        A new list with surrounding whitespace removed from each entry and
        every blank entry removed. The input list is not modified.

    Raises:
        AssertionError: If ``text`` is empty or is not a list.
    """
    assert (len(text) > 0)
    assert (isinstance(text, list))
    # list.remove("") would only delete the first blank entry; filter instead
    # so that every blank line is removed.
    return [line for line in (t.strip() for t in text) if line]
+ + Returns: + ids: list of integers + """ + return [int(w) + self._num_reserved_ids for w in s.split()] + + def decode(self, ids, strip_extraneous=False): + """Transform a sequence of int ids into a human-readable string. + + EOS is not expected in ids. + + Args: + ids: list of integers to be converted. + strip_extraneous: bool, whether to strip off extraneous tokens + (EOS and PAD). + + Returns: + s: human-readable string. + """ + if strip_extraneous: + ids = strip_ids(ids, list(range(self._num_reserved_ids or 0))) + return " ".join(self.decode_list(ids)) + + def decode_list(self, ids): + """Transform a sequence of int ids into a their string versions. + + This method supports transforming individual input/output ids to their + string versions so that sequence to/from text conversions can be visualized + in a human readable format. + + Args: + ids: list of integers to be converted. + + Returns: + strs: list of human-readable string. + """ + decoded_ids = [] + for id_ in ids: + if 0 <= id_ < self._num_reserved_ids: + decoded_ids.append(RESERVED_TOKENS[int(id_)]) + else: + decoded_ids.append(id_ - self._num_reserved_ids) + return [str(d) for d in decoded_ids] + + @property + def vocab_size(self): + raise NotImplementedError() + + +class TokenTextEncoder(TextEncoder): + """Encoder based on a user-supplied vocabulary (file or list).""" + def __init__( + self, + vocab_filename, + reverse=False, + vocab_list=None, + replace_oov=None, + num_reserved_ids=NUM_RESERVED_TOKENS + ): + """Initialize from a file or list, one token per line. + + Handling of reserved tokens works as follows: + - When initializing from a list, we add reserved tokens to the vocab. + - When initializing from a file, we do not add reserved tokens to the vocab. + - When saving vocab files, we save reserved tokens to the file. + + Args: + vocab_filename: If not None, the full filename to read vocab from. If this + is not None, then vocab_list should be None. 
+ reverse: Boolean indicating if tokens should be reversed during encoding + and decoding. + vocab_list: If not None, a list of elements of the vocabulary. If this is + not None, then vocab_filename should be None. + replace_oov: If not None, every out-of-vocabulary token seen when + encoding will be replaced by this string (which must be in vocab). + num_reserved_ids: Number of IDs to save for reserved tokens like . + """ + super(TokenTextEncoder, + self).__init__(num_reserved_ids=num_reserved_ids) + self._reverse = reverse + self._replace_oov = replace_oov + if vocab_filename: + self._init_vocab_from_file(vocab_filename) + else: + assert vocab_list is not None + self._init_vocab_from_list(vocab_list) + self.pad_index = self._token_to_id[PAD] + self.eos_index = self._token_to_id[EOS] + self.unk_index = self._token_to_id[UNK] + self.seg_index = self._token_to_id[ + SEG] if SEG in self._token_to_id else self.eos_index + + def encode(self, s): + """Converts a space-separated string of tokens to a list of ids.""" + sentence = s + tokens = sentence.strip().split() + if self._replace_oov is not None: + tokens = [ + t if t in self._token_to_id else self._replace_oov + for t in tokens + ] + ret = [self._token_to_id[tok] for tok in tokens] + return ret[::-1] if self._reverse else ret + + def decode(self, ids, strip_eos=False, strip_padding=False): + if strip_padding and self.pad() in list(ids): + pad_pos = list(ids).index(self.pad()) + ids = ids[:pad_pos] + if strip_eos and self.eos() in list(ids): + eos_pos = list(ids).index(self.eos()) + ids = ids[:eos_pos] + return " ".join(self.decode_list(ids)) + + def decode_list(self, ids): + seq = reversed(ids) if self._reverse else ids + return [self._safe_id_to_token(i) for i in seq] + + @property + def vocab_size(self): + return len(self._id_to_token) + + def __len__(self): + return self.vocab_size + + def _safe_id_to_token(self, idx): + return self._id_to_token.get(idx, "ID_%d" % idx) + + def _init_vocab_from_file(self, 
filename): + """Load vocab from a file. + + Args: + filename: The file to load vocabulary from. + """ + with open(filename) as f: + tokens = [token.strip() for token in f.readlines()] + + def token_gen(): + for token in tokens: + yield token + + self._init_vocab(token_gen(), add_reserved_tokens=False) + + def _init_vocab_from_list(self, vocab_list): + """Initialize tokens from a list of tokens. + + It is ok if reserved tokens appear in the vocab list. They will be + removed. The set of tokens in vocab_list should be unique. + + Args: + vocab_list: A list of tokens. + """ + def token_gen(): + for token in vocab_list: + if token not in RESERVED_TOKENS: + yield token + + self._init_vocab(token_gen()) + + def _init_vocab(self, token_generator, add_reserved_tokens=True): + """Initialize vocabulary with tokens from token_generator.""" + + self._id_to_token = {} + non_reserved_start_index = 0 + + if add_reserved_tokens: + self._id_to_token.update(enumerate(RESERVED_TOKENS)) + non_reserved_start_index = len(RESERVED_TOKENS) + + self._id_to_token.update( + enumerate(token_generator, start=non_reserved_start_index) + ) + + # _token_to_id is the reverse of _id_to_token + self._token_to_id = dict((v, k) + for k, v in six.iteritems(self._id_to_token)) + + def pad(self): + return self.pad_index + + def eos(self): + return self.eos_index + + def unk(self): + return self.unk_index + + def seg(self): + return self.seg_index + + def store_to_file(self, filename): + """Write vocab file to disk. + + Vocab files have one token per line. The file ends in a newline. Reserved + tokens are written to the vocab file as well. + + Args: + filename: Full path of the file to store the vocab to. 
+ """ + with open(filename, "w") as f: + for i in range(len(self._id_to_token)): + f.write(self._id_to_token[i] + "\n") + + def sil_phonemes(self): + return [p for p in self._id_to_token.values() if not p[0].isalpha()] + + +class TextGrid(object): + def __init__(self, text): + text = remove_empty_lines(text) + self.text = text + self.line_count = 0 + self._get_type() + self._get_time_intval() + self._get_size() + self.tier_list = [] + self._get_item_list() + + def _extract_pattern(self, pattern, inc): + """ + Parameters + ---------- + pattern : regex to extract pattern + inc : increment of line count after extraction + Returns + ------- + group : extracted info + """ + try: + group = re.match(pattern, self.text[self.line_count]).group(1) + self.line_count += inc + except AttributeError: + raise ValueError( + "File format error at line %d:%s" % + (self.line_count, self.text[self.line_count]) + ) + return group + + def _get_type(self): + self.file_type = self._extract_pattern(r"File type = \"(.*)\"", 2) + + def _get_time_intval(self): + self.xmin = self._extract_pattern(r"xmin = (.*)", 1) + self.xmax = self._extract_pattern(r"xmax = (.*)", 2) + + def _get_size(self): + self.size = int(self._extract_pattern(r"size = (.*)", 2)) + + def _get_item_list(self): + """Only supports IntervalTier currently""" + for itemIdx in range(1, self.size + 1): + tier = OrderedDict() + item_list = [] + tier_idx = self._extract_pattern(r"item \[(.*)\]:", 1) + tier_class = self._extract_pattern(r"class = \"(.*)\"", 1) + if tier_class != "IntervalTier": + raise NotImplementedError( + "Only IntervalTier class is supported currently" + ) + tier_name = self._extract_pattern(r"name = \"(.*)\"", 1) + tier_xmin = self._extract_pattern(r"xmin = (.*)", 1) + tier_xmax = self._extract_pattern(r"xmax = (.*)", 1) + tier_size = self._extract_pattern(r"intervals: size = (.*)", 1) + for i in range(int(tier_size)): + item = OrderedDict() + item["idx"] = self._extract_pattern(r"intervals \[(.*)\]", 1) + 
item["xmin"] = self._extract_pattern(r"xmin = (.*)", 1) + item["xmax"] = self._extract_pattern(r"xmax = (.*)", 1) + item["text"] = self._extract_pattern(r"text = \"(.*)\"", 1) + item_list.append(item) + tier["idx"] = tier_idx + tier["class"] = tier_class + tier["name"] = tier_name + tier["xmin"] = tier_xmin + tier["xmax"] = tier_xmax + tier["size"] = tier_size + tier["items"] = item_list + self.tier_list.append(tier) + + def toJson(self): + _json = OrderedDict() + _json["file_type"] = self.file_type + _json["xmin"] = self.xmin + _json["xmax"] = self.xmax + _json["size"] = self.size + _json["tiers"] = self.tier_list + return json.dumps(_json, ensure_ascii=False, indent=2) + + +def read_duration_from_textgrid( + textgrid_path: Union[str, Path], + phoneme: str, + utterance_duration: float, +): + ph_list = phoneme.split(" ") + with open(textgrid_path, "r") as f: + textgrid = f.readlines() + textgrid = remove_empty_lines(textgrid) + textgrid = TextGrid(textgrid) + textgrid = json.loads(textgrid.toJson()) + + split = np.ones(len(ph_list) + 1, np.float) * -1 + tg_idx = 0 + ph_idx = 0 + tg_align = [x for x in textgrid['tiers'][-1]['items']] + tg_align_ = [] + for x in tg_align: + x['xmin'] = float(x['xmin']) + x['xmax'] = float(x['xmax']) + if x['text'] in ['sil', 'sp', '', 'SIL', 'PUNC']: + x['text'] = '' + if len(tg_align_) > 0 and tg_align_[-1]['text'] == '': + tg_align_[-1]['xmax'] = x['xmax'] + continue + tg_align_.append(x) + tg_align = tg_align_ + tg_len = len([x for x in tg_align if x['text'] != '']) + ph_len = len([x for x in ph_list if not is_sil_phoneme(x)]) + assert tg_len == ph_len, (tg_len, ph_len, tg_align, ph_list, textgrid_path) + while tg_idx < len(tg_align) or ph_idx < len(ph_list): + if tg_idx == len(tg_align) and is_sil_phoneme(ph_list[ph_idx]): + split[ph_idx] = 1e8 + ph_idx += 1 + continue + x = tg_align[tg_idx] + if x['text'] == '' and ph_idx == len(ph_list): + tg_idx += 1 + continue + assert ph_idx < len(ph_list), ( + tg_len, ph_len, tg_align, 
import os

url = "https://api.xi-ai.cn/v1/chat/completions"

# SECURITY: credentials must never be committed. The key is read from the
# XI_API_KEY environment variable; any key that was previously checked in
# should be treated as compromised and revoked at the provider.
# NOTE(review): the value is sent as-is in the Authorization header, as
# before. If the endpoint expects a "Bearer " prefix, put it in the variable.
headers = {
    "Content-Type": "application/json",
    "Authorization": os.environ.get("XI_API_KEY", ""),
}
+And pay attention to vocabulary that represents the order and frequency of events, such as 'after', 'followed by', 'n times', and so on. +You can use the latest ending event of all events in the training dataset as the total audio time +It is preferred that events do not overlap as much as possible. +Now, I will provide you with some examples in training set for your learning, each example in the format 'index: input~output'. +{"onset": "squeal__1.359-2.373_3.216-4.23_5.576-6.59", "captions": "squeal 3 times", "length": "7.52"} +{"onset": "sawing__1.432-3.975_4.533-6.54", "captions": "sawing 2 times", "length": "9.26"} +{"onset": "slap__1.576-1.931_2.911-3.266--baby_laughter__5.179-6.394_7.362-8.577", "captions": "slap 2 times and baby laughter 2 times", "length": "9.59"} +{"onset": "applause__1.538-5.128--scrape__7.03-8.004", "captions": "applause and scrape", "length": "9.13"} +{"onset": "slam__0.68-1.01--walk__2.364-4.107--busy_signal__6.794-7.222_8.371-8.645", "captions": "slam 1 times and walk 1 times and busy signal 2 times", "length": "9.18"} +{"onset": "slap__1.044-1.399--neigh__2.654-4.663_5.633-6.966", "captions": "slap 1 times followed by neigh 2 times", "length": "9.22"} +{"onset": "bird_vocalization__1.253-2.184--yip__4.789-5.309_6.134-6.654", "captions": "bird vocalization 1 times and yip 2 times", "length": "9.83"} +{"onset": "animal__1.478-3.541--crowing__5.464-7.11", "captions": "animal then crowing", "length": "9.45"} +{"onset": "crying__0.999-7.773", "captions": "crying", "length": "9.48"} +{"onset": "cricket__1.629-4.983", "captions": "cricket 1 times", "length": "5.87"} +{"onset": "fireworks__1.336-2.477--car__4.193-6.649", "captions": "car after fireworks", "length": "9.7"} +""" +training_info_post = """ +It is worth noting that you should judge both the duration of a single event and the total duration based on experience and the examples I provided. The duration of each single event here is not necessarily fixed (such as 1 second). 
def get_time_info(caption, timeout=120):
    """Ask the chat-completions endpoint to turn a caption into timing info.

    The prompt is built from the module-level ``training_info_pri`` and
    ``training_info_post`` templates and posted to ``url`` with ``headers``.

    Args:
        caption: Free-text description of the audio events.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The text content of the first completion choice, or ``None`` if the
        request failed, returned a non-200 status, or the response body did
        not have the expected structure. Callers must handle ``None`` before
        parsing the result.
    """
    prompt = (
        f"{training_info_pri}\n"
        f'Now,you can transform "captions":\n'
        f'"{caption}"\n'
        f"{training_info_post}"
    )
    data = {
        "model": "gpt-5-mini",
        "stream": False,
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": prompt
            }
        ]
    }
    try:
        response = requests.post(
            url, headers=headers, json=data, timeout=timeout
        )
    except requests.RequestException as err:
        # Connection errors and timeouts follow the same "return None"
        # failure path as a non-200 status.
        print(f"Error: request failed: {err}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    try:
        # Parse the body once; a 200 response is not guaranteed to carry
        # the expected JSON structure.
        content = response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError) as err:
        print(f"Error: unexpected response payload: {err}")
        return None

    print(content)
    return content
+ filename) + logger.setLevel(getattr(logging, self.level)) + + file_handler = logging.FileHandler(filename) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger diff --git a/utils/lr_scheduler_utilities.py b/utils/lr_scheduler_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..6bd0333a6c51bbc1f307213eced5d34b4b906d22 --- /dev/null +++ b/utils/lr_scheduler_utilities.py @@ -0,0 +1,154 @@ +from typing import Any +import math +import copy +from torch.utils.data import DataLoader + + +def get_warmup_steps( + dataloader_one_pass_outside_steps: int, + warmup_steps: int | None = None, + warmup_epochs: float | None = None, + epoch_length: int | None = None, +) -> int: + """ + Derive warmup steps according to step number or epoch number. + If `warmup_steps` is provided, then just return it. Otherwise, derive + the warmup steps by epoch length and warmup epoch number. + """ + if warmup_steps is not None: + return warmup_steps + else: + if epoch_length is None: + epoch_length = dataloader_one_pass_outside_steps + assert warmup_epochs is not None, "warmup_steps and warmup_epochs cannot be both None" + return int(epoch_length * warmup_epochs) + + +def get_dataloader_one_pass_outside_steps( + train_dataloader: DataLoader, + num_processes: int = 1, +): + """ + dataloader length after DDP, close to `original_length / gpu_number` + """ + return math.ceil(len(train_dataloader) / num_processes) + + +def get_total_training_steps( + train_dataloader: DataLoader, + epochs: int, + num_processes: int = 1, + epoch_length: int | None = None +): + """ + Calculate the total number of "visible" training steps. + + If `epoch_length` is provided, it is used as the fixed length for each epoch. + Otherwise, the function will determine the epoch length from `train_dataloader`. + + Args: + train_dataloader: + Training dataloader object. + epochs: + The total number of epochs to run. 
+ num_processes: + The number of parallel processes used for distributed training. + epoch_length: + A fixed number of training steps for each epoch. Defaults to None. + + Returns: + int: The total number of training steps (i.e., `epochs * epoch_length`). + """ + # `epoch_length` is not None: fixed length for each epoch + if epoch_length is None: + # `epoch_length` is the length of DDP-wrapped `train_dataloader` + epoch_length = get_dataloader_one_pass_outside_steps( + train_dataloader, num_processes + ) + return epochs * epoch_length + + +def get_dataloader_one_pass_steps_inside_accelerator( + dataloader_one_pass_steps: int, gradient_accumulation_steps: int, + num_processes: int +): + """ + Calculate the number of "visible" training steps for a single pass over the dataloader + inside an accelerator, accounting for gradient accumulation and distributed training. + + + Args: + dataloader_one_pass_steps: + The number of steps (batches) in one pass over the dataset. + gradient_accumulation_steps: + The number of steps to accumulate gradients before performing a parameter update. + num_processes: + The number of parallel processes used for distributed training. + + Returns: + int: The total number of "visible" training steps for one pass over the dataset, + multiplied by the number of processes. + """ + return math.ceil( + dataloader_one_pass_steps / gradient_accumulation_steps + ) * num_processes + + +def get_steps_inside_accelerator_from_outside_steps( + outside_steps: int, dataloader_one_pass_outside_steps: int, + dataloader_one_pass_steps_inside_accelerator: int, + gradient_accumulation_steps: int, num_processes: int +): + """ + Convert "outside" steps (as observed in wandb logger or similar context) + to the corresponding number of "inside" steps (for accelerate lr scheduler). + + Specifically, accelerate lr scheduler call `step()` `num_processes` times for + every `gradient_accumulation_steps` outside steps. 
+ + Args: + outside_steps: + The total number of steps counted outside accelerate context. + dataloader_one_pass_outside_steps: + The number of steps (batches) to complete one pass of the dataloader + outside accelerate. + dataloader_one_pass_steps_inside_accelerator: + The number of `lr_scheduler.step()` calls inside accelerate, calculated via + `get_dataloader_one_pass_steps_inside_accelerator`. + gradient_accumulation_steps: + The number of steps to accumulate gradients. + num_processes: + The number of parallel processes (GPUs) used in distributed training. + + Returns: + int: The total number of `lr_scheduler.step()` calls inside accelerate that + correspond to the given `outside_steps`. + """ + num_dataloader_epochs_passed = outside_steps // dataloader_one_pass_outside_steps + remaining_outside_steps = outside_steps % dataloader_one_pass_outside_steps + remaining_inside_accelerator_steps = ( + remaining_outside_steps // gradient_accumulation_steps * num_processes + ) + # accelerate scheduler call `step()` `num_processes` times every + # `gradient_accumulation_steps` steps: + # https://github.com/huggingface/accelerate/blob/main/src/accelerate/scheduler.py#L76 + total_steps = ( + num_dataloader_epochs_passed* + dataloader_one_pass_steps_inside_accelerator + + remaining_inside_accelerator_steps + ) + return total_steps + + +def lr_scheduler_param_adapter( + config_dict: dict[str, Any], num_training_steps: int, num_warmup_steps: int +) -> dict[str, Any]: + target_class = config_dict["_target_"] + return_dict = copy.deepcopy(config_dict) + if target_class == "transformers.get_scheduler": + return_dict.update({ + "num_training_steps": num_training_steps, + "num_warmup_steps": num_warmup_steps + }) + + return return_dict diff --git a/utils/merge_content.json b/utils/merge_content.json new file mode 100644 index 0000000000000000000000000000000000000000..de31715833115d135c88298ab934a010e62e624b --- /dev/null +++ b/utils/merge_content.json @@ -0,0 +1,6196 @@ +[ + { + 
"event": "accelerating", + "phrases": [ + "accelerating", + "accelerating revving", + "accelerating vehicle", + "accelerating engine", + "accelerating vehicle engine", + "vehicle motor accelerating and deaccelerating", + "increased revving", + "engine accelerating", + "accelerating and revving", + "an engine decelerates then accelerates", + "revving vehicle", + "accelerating a vehicle", + "a revving", + "reving", + "a series of accelerating", + "an engine revving and then slowing", + "accelerating motor", + "engine accelerating and deccelerating", + "accelerating engines", + "an engine gradually revs down", + "engine revving", + "engine accelerating rapidly", + "engine motor revving", + "engine acceleration", + "acceleration", + "engine accelerating then slowing down", + "an even stronger revving", + "accelerating car", + "a vehicle engine accelerating at a rapid rate", + "The engine roared as the vehicle picked up speed", + "A deep, powerful hum filled the air as acceleration began", + "The sound of the motor intensified as the vehicle surged forward", + "A sharp, rising whirring noise accompanied the vehicle's acceleration", + "A steady increase in engine noise signaled the vehicle speeding up" + ] + }, + { + "event": "air_horn", + "phrases": [ + "air horns", + "air horns blare in a series", + "air horn", + "airhorn", + "air horn sounds", + "horns sound", + "air horns alternate", + "air horns blare twice", + "air horn sound", + "air horns sound", + "air horns blow", + "a brief, loud horn", + "a horn blows in different patterns and tunes", + "air raid siren with delay", + "a sudden series of a horn blowing loudly", + "A loud, piercing blast echoed through the air", + "The deep, resonant honk of an air horn cut through the noise", + "A sudden blaring sound burst out, commanding attention", + "A booming horn sound reverberated with authority", + "The unmistakable blast of an air horn signaled its presence" + ] + }, + { + "event": "aircraft", + "phrases": [ + 
"aircraft", + "aircraft fly", + "aircraft motor", + "aircraft noise", + "aircraft engine", + "an aircraft approaches", + "aircraft flies", + "aircraft approach", + "aircrafts", + "an aircraft passes", + "fixed-wing aircraft fly overhead", + "an aircraft propeller", + "an aircraft flies by", + "an aircraft moves fast", + "airplane engine", + "an aircraft engine goes by", + "aircraft sounds", + "an aircraft engine roars by", + "an aircraft approaches and passes", + "a propeller aircraft flies by loudly", + "an aircraft engine stuttering", + "an aircraft engine swirls in the background", + "large aircraft taking off", + "an aircraft engine gets louder as it approaches", + "another aircraft engine passing by", + "an aircraft passes by loudly", + "an aircraft is performing a flyby", + "an aircraft engine", + "an aircraft engine approaching closer", + "A distant, steady hum of an aircraft filled the sky", + "The faint roar of engines could be heard overhead", + "A low-frequency rumble signified an aircraft passing by", + "The rhythmic drone of an aircraft traveled across the atmosphere", + "A continuous engine noise hovered in the background, marking flight" + ] + }, + { + "event": "aircraft_engine", + "phrases": [ + "aircraft engine sound", + "aircraft approach", + "airplane", + "air propellers", + "prop engine sound", + "airplanes", + "airoplane sound", + "a plane's engine starts", + "aircraft noise", + "aircraft", + "airplane starting roughly", + "aircraft play", + "plane takes off", + "helicopter", + "a voice of idling aircraft engine", + "aircraft fly", + "aircraft flies", + "engine of airplane taking off", + "jets", + "aircraft engine noise", + "airplane engine", + "A powerful, steady engine roar resonated as the aircraft operated", + "The deep and consistent hum of an aircraft engine was distinct", + "A mechanical whirring filled the air as the engine powered up", + "A steady droning noise emanated from the aircraft's engine", + "The unmistakable sound of an 
aircraft engine dominated the atmosphere" + ] + }, + { + "event": "alarm", + "phrases": [ + "alarm", + "an alarm", + "alarms", + "futuristic alarm sound", + "a warning alarm", + "alarm sound from an energy cube vault", + "an alarm sound is being synthesized with reverb and panning", + "a distress alarm", + "alarm sound made with feedback loop", + "a composite alarmsample is being looped", + "synthesized alarm sound", + "funky alarm sound", + "alarm blare", + "an alarm from space", + "an emergency alarm", + "a composite alarm sample is playing", + "security system alarm", + "synthesized alarm tone", + "alarm with repetition", + "an alarm is ringing in a media library", + "another alarm", + "an alarm for spaceships or robots", + "a shrill back-up alarm rings", + "setting off an alarm repeatedly", + "more alarms", + "an alarm-like soundscape", + "an alarm beeps loudly multiple times", + "A loud, urgent ringing sound pierced through the environment", + "The stark wail of an alarm signaled potential danger", + "A repetitive beeping noise filled the air as the alarm activated", + "An insistent, high-pitched alarm tone demanded immediate attention", + "The shrill sound of an alarm broke the silence" + ] + }, + { + "event": "alarm_clock", + "phrases": [ + "alarm clock", + "alarm clock is put down and continues playing", + "an alarm clock goes off repeatedly", + "an alarm clock goes off intermittently", + "alarm clocks", + "loud alarm clock sound effect", + "an alarm clock goes off with mechanical sounds", + "an alarm clock is heard repeatedly", + "alarm clocks buzz repetitively", + "an alarm clock goes off and later stops", + "an alarm clock is ringing repeatedly", + "an alarm clock beeping continuously", + "an alarm clock is ringing continuously", + "multiple alarm clocks", + "an alarm clock", + "an alarm is ringing continuously and repeatedly", + "alarm clocks are heard repeatedly", + "crossing alarm sounding", + "an alarm clock rings repeatedly", + "alarm clock is 
blaring loudly", + "an alarm clock beeps repeatedly", + "an alarm clock beeping a tune", + "a clock sounds an alarm", + "a wrist watch alarm", + "an alarm clock sounds intermittently", + "various alarms", + "an alarm is ringing from a clock radio", + "an alarm clock is ticking repeatedly", + "an alarm clock sounds repeatedly", + "alarm clock sounds", + "A sharp, repetitive beeping sound signaled the start of the day", + "A rhythmic ringing noise came from the alarm clock", + "The consistent chime of an alarm clock filled the room", + "A sudden burst of sound from the alarm clock broke the quiet", + "The insistent sound of an alarm clock urged action" + ] + }, + { + "event": "ambulance_(siren)", + "phrases": [ + "a two-tone emergency vehicle siren blows as it approaches, passes by, and then fades into the distance", + "a couple of sirens blaring one after the other", + "sirens ring as they approach", + "several sirens, sounding at different frequencies", + "a siren slowly gets louder as it approaches", + "an emergency vehicle siren blows, approaching and then fading", + "ambulance siren", + "an emergency vehicle blasts a series of sirens", + "an emergency vehicle siren wails and echoes", + "a siren getting louder as it approaches", + "an emergency vehicle approaches with its siren blaring", + "emergency sirens are blaring as they approach and pass by", + "loud siren gets closer and then gets further away", + "a series of emergency sirens sounding", + "loud, continuous ambulance sirens", + "several different ambulance sirens are triggered subsequently", + "an emergency vehicle siren wails in different patterns", + "an ambulance sounds its siren", + "ambulance sirens", + "different types of ambulance vehicle sirens are blaring in turn", + "an ambulance siren is triggered and moves to get closer", + "an ambulance siren is triggered and moves to get closer and then further", + "an ambulance siren receding as the ambulance drives away", + "an ambulance's siren", + "A 
wailing siren sound echoed in the distance", + "The oscillating pitch of a siren signaled an ambulance approaching", + "A high-pitched siren sound cut through the air, alerting others", + "The urgent, alternating tones of an ambulance siren were unmistakable", + "A rapid, piercing siren sound announced the ambulance's presence" + ] + }, + { + "event": "animal", + "phrases": [ + "animal noise", + "the sounds of an animal", + "animal sound", + "the sounds of animals", + "animal sound effects", + "an animal", + "animal sounds", + "animal sounds outside", + "animal contact sounds", + "an animal sound effect", + "animal calling", + "animal noises", + "animal sounds occur", + "animal roars", + "another animal", + "A series of chirping noises filled the environment", + "The low growl of an animal echoed softly", + "Distinct animal calls could be heard in the distance", + "Various animal vocalizations added a natural melody to the surroundings", + "The continuous chatter of animals created a lively ambiance" + ] + }, + { + "event": "applause", + "phrases": [ + "applause", + "applauding", + "applause from audience", + "applause erupts", + "small audience ovation", + "general applause", + "crowd applauses", + "canned applause", + "long applause", + "crowd applauds", + "crowd applause", + "audience applause", + "cheering", + "crowd applausing", + "crowd applaud", + "clapping audience", + "applause amid crowds", + "cheering with applause", + "applauses", + "audience applauds", + "applause breaks out", + "claps", + "crowd claps", + "enthusiastic applause", + "polite applause", + "clapping begins", + "applause sounds", + "clapping hands", + "audience gives applause", + "applause and clapping", + "A rhythmic clapping sound erupted from the crowd", + "The sound of multiple hands clapping echoed in the space", + "A burst of applause broke out, filling the air with energy", + "The collective clapping noise created a wave of enthusiasm", + "A loud ovation of clapping resonated 
through the area" + ] + }, + { + "event": "artillery_fire", + "phrases": [ + "artillery fire", + "artillery", + "artillery fires", + "artillery fire erupts", + "artillery starts firing", + "artillery guns fire", + "artillery firing", + "artillery fire booms", + "artillery rounds fire", + "artillery cannons firing", + "artillery cannons firing several times", + "artillery is launched", + "artillery fire goes off", + "artillery cannons firing several times with an echo", + "artillery sounds", + "artillery fire occurs", + "artillery is fired in the distance", + "artillery fire rings out", + "artillery fire sounds", + "artillery fire occurs once", + "artillery fires in the background", + "A loud, booming explosion echoed through the air", + "A deep, resounding blast marked the firing of artillery", + "A sharp, cracking sound accompanied the artillery discharge", + "The thunderous sound of artillery fire shook the surroundings", + "A distant rumble signaled the power of artillery in action" + ] + }, + { + "event": "baby_cry", + "phrases": [ + "infant cries", + "an infant cries continously", + "baby cry", + "an infant is crying loudly and persistently", + "an infant cries repeatedly and loudly", + "a baby is crying very deeply in way that reverberates", + "infants cry", + "a baby cries over and over", + "an infant crying consistently", + "cry", + "baby crying sounds", + "a baby crying repeatedly and loudly", + "an infant cries continuously", + "baby cries", + "a baby continuously cries", + "young infant crying hard", + "two infants cry together", + "a young infant cries for a short while", + "an infant is crying continuously", + "an infant cries funnily", + "baby upset", + "baby is crying during birth", + "an infant crying continuously", + "an infant cries loudly and harshly", + "an infant cries repeatedly", + "infant screaming", + "a baby crying unceasingly", + "an infant cries repeatedly and softly", + "a crying infant", + "a small infant cries repeatedly", + "A 
high-pitched, wailing sound of a distressed infant filled the air", + "The sharp, repetitive crying of a baby was unmistakable", + "A soft whimper escalated into full-blown crying", + "The piercing sound of an infant crying echoed through the space", + "A rhythmic bawling sound indicated a baby's need for attention" + ] + }, + { + "event": "baby_laughter", + "phrases": [ + "baby laughs", + "baby jabbering", + "baby laugh", + "infant laughs", + "infant jabbering", + "infant laugh", + "infants laughter", + "infant laughter", + "baby laughter", + "baby gurgling laughter", + "an infant laughs continuously", + "baby laughing sound", + "infant laughing sound", + "two infants laugh", + "a baby giggle sporadically", + "a baby laughs loudly and frequently", + "A series of high-pitched giggles from a baby rang out joyfully", + "The bubbly laughter of an infant filled the room with happiness", + "A soft, melodic chuckle from a baby brought a sense of warmth", + "A baby's contagious laughter spread through the air", + "The cheerful giggling of a baby created a lively atmosphere" + ] + }, + { + "event": "bark", + "phrases": [ + "bark", + "real dog bark", + "dog bark", + "dog barks", + "barks", + "a dog barks four times", + "dog barks twice", + "barking", + "two dogs bark", + "barking dog", + "a dog barks over and over", + "a dog bark echos", + "a dog barks multiple times", + "a dog barks sharply", + "a dog barks multiple times loudly nearby", + "a series of dog barks", + "a bark intermittently", + "dog barks loudly", + "aggressive dog barking", + "big dog barking", + "dogs bow-wow", + "an angry big dog is barking", + "a dog barks furiously", + "the dog barking four times", + "dog barking repeatedly", + "a dog barks several times", + "a dog barks quickly several times", + "dog barking", + "a dog barks urgently", + "the dog inside barks two deliberate barks", + "A sharp, loud bark echoed through the area", + "The repetitive woof of a dog broke the silence", + "A low growl 
transitioned into a firm bark", + "The distinct sound of a dog's bark signaled its presence", + "A series of short, sharp barks conveyed urgency" + ] + }, + { + "event": "bee", + "phrases": [ + "bee sounds", + "bees buzz", + "bees buzz sounds", + "bee sound", + "several bees fly", + "bees", + "bees are buzzing", + "bees buzzing", + "bees fly", + "a bee", + "bee sound", + "a bee is buzzing", + "a bee is flying", + "a bee flies", + "bees fly", + "bees are lightly buzzing", + "bee fly sound", + "bee flying sound", + "a few bees fly nearby", + "a few bees buzz around", + "bees swarm loudly nearby", + "bees buzzing faintly", + "bees flying faintly", + "bees fly in the distance", + "bees whining", + "bees are flying around", + "flying bees", + "A soft, continuous buzzing sound filled the air", + "The distinctive hum of bees at work was audible nearby", + "A faint buzzing grew louder as the bees approached", + "The rhythmic buzz of bees created a natural melody", + "The gentle hum of bees added a sense of activity to the surroundings" + ] + }, + { + "event": "beep", + "phrases": [ + "beep", + "bleep", + "a warning beep", + "a device beep", + "a quick beep", + "an electronic beep", + "broken beep", + "an electronic bleep beeping three times", + "quick beep", + "an electronic bleep beeping once", + "a mechanical beep", + "error beep", + "a large beep", + "item beeping", + "a beeping", + "digital beeping", + "a second beep", + "an electronic bleep beeping repeatedly", + "a small digital beep", + "longer beep", + "a short beep", + "a beep repeats multiple times", + "single beep", + "a loud digital beep", + "beep beep beep", + "beeps", + "a digital beep repeating", + "a digital beeping", + "a beep repeats", + "beeping", + "A short, high-pitched beep broke the stillness", + "The repetitive beeping of a machine was clearly audible", + "A single, sharp beep indicated a completed action", + "The consistent tone of a beep filled the background", + "An intermittent beeping sound 
captured attention" + ] + }, + { + "event": "bell", + "phrases": [ + "bell", + "bell ring", + "bell toll sound", + "station bell", + "bells", + "meditation bells", + "small bell", + "a bell ding", + "a hand bell is striking the note c#/db", + "single bell beat", + "bell effects", + "a single hand bell strike", + "ring bell sounds", + "a metal bell ding", + "a bell-like resonance with few overtones", + "a warning bell", + "bells ring outside", + "a loud bell", + "a hand bell is being struck with the note A", + "a bell rings", + "bell sounds", + "a bell chimes loudly", + "a warning bell dings", + "a bell", + "a bell rings out", + "bell rings", + "a bell sound", + "bell dings", + "bells ring out in a melody", + "A clear, resonant ringing sound of a bell reverberated", + "A soft chime of a bell broke the silence", + "The metallic ringing of a bell echoed through the surroundings", + "A single, crisp bell tone signified an event", + "The rhythmic tolling of a bell filled the air" + ] + }, + { + "event": "bicycle_bell", + "phrases": [ + "sound of a bike bell", + "a bike bell", + "bicycle bell", + "a bicycle bell", + "a bike bell rings", + "a bicycle bell is sounding off", + "bike bell is ringing", + "bike bell is being hit continuously", + "bike bell is ringing continuously", + "a bike bell is ringing", + "a bike bell ringing", + "a bicycle bell rings", + "a bicycle bell is being rung", + "a bicycle bell ringing through a crowded street", + "a bicycle bell dings", + "a bicycle bell rings twice", + "a bicycle bell is ringing", + "a bicycle bell ring", + "a bicycle bell being rung several times", + "a bike bell ringing on the street", + "a bicycle rings its bell", + "A sharp ding from a bicycle bell alerted nearby pedestrians", + "The cheerful chime of a bicycle bell rang out clearly", + "A repetitive dinging sound came from a bicycle bell", + "The distinct ring of a bicycle bell broke through the ambient noise", + "A crisp bicycle bell tone signaled its approach" + ] + }, 
+ { + "event": "bird", + "phrases": [ + "bird", + "bird tweet", + "the sounds of birds", + "the sounds of bird calls", + "pet birds tweet", + "birds tweet", + "bird tweets", + "birds chip", + "a bird sound", + "bird squawk", + "bird call", + "birdchirps", + "singing of a bird", + "birds tweets", + "the chirping of a bird", + "bird song", + "bird tweeting", + "a group of birds chirp", + "chirping of a bird", + "a series of bird chirping", + "The melodic chirping of birds filled the morning air", + "A series of high-pitched bird calls echoed nearby", + "The natural warbling of birds created a peaceful ambiance", + "The rhythmic tweeting of birds was unmistakable", + "A soft flurry of bird calls added life to the surroundings" + ] + }, + { + "event": "bird_flight", + "phrases": [ + "birds are taking flight", + "birds flight", + "birds are flying", + "birds taking flight", + "birds flight sound", + "birds flying sound", + "sounds of birds flight", + "birds fly around", + "birds fly nearby", + "a bird is fluttering in flight", + "sound of bird flight", + "sound of bird flying", + "birds flap their wings in flight", + "birds are making flight sounds", + "the sounds of bird flight", + "a bird flying away", + "a bird flying off", + "several birds fly", + "A soft flapping sound marked a bird in motion", + "The faint rustling of wings was audible as the bird took off", + "A steady whooshing noise accompanied the bird's flight", + "The gentle fluttering of wings echoed softly", + "The rhythmic beat of wings signified a bird in flight" + ] + }, + { + "event": "bird_vocalization", + "phrases": [ + "finch sound", + "a bird song phrase", + "calls from a single bird", + "a bird making a call", + "bird", + "birds outside", + "a bird tweets sharply", + "bird vocalizations outside", + "a bird calling three times", + "a bird song is playing", + "bird singing", + "bird songs", + "loud bird song", + "a bird is tweeting a bird song", + "a sweet bird song", + "a bird is chirping", + "a 
bird vocalizes repeatedly", + "a bird vocalizes", + "a drumming bird call", + "bird calling out", + "various bird calls", + "A melodious bird song resonated through the air", + "The chirping and tweeting of birds created a harmonious melody", + "A complex series of bird vocalizations filled the environment", + "The melodic trills and whistles of a bird added charm to the surroundings", + "The varied calls of birds blended into a soothing soundscape" + ] + }, + { + "event": "bleat", + "phrases": [ + "bleat", + "bleats", + "livestock bleat", + "goat bleeping", + "bleating", + "lamb bleating", + "an animal bleats three times", + "an animal bleat at a constant pace", + "a sheep bleats two times", + "a sheep baa baa", + "sheep baah", + "sheep bleat by her", + "goat bleat", + "goat bleating", + "repetitive bleating of a goat", + "an animal bleats loudly", + "animal bleats", + "goat bleeting", + "sheep bleat", + "the sheep bleat", + "sheep baaing", + "sheep", + "sheep baa", + "goat crying out", + "a sheep bleats a couple of times", + "the bleating cry of a sheep", + "animal bleating", + "The soft, repetitive bleating of a sheep echoed in the distance", + "A high-pitched bleat signaled the presence of a goat", + "The distinct bleating of livestock added a rural ambiance", + "A rhythmic bleat carried through the air, filling the surroundings", + "A lone bleat cut through the quiet, attracting attention" + ] + }, + { + "event": "boing", + "phrases": [ + "boing", + "a \"boing\"", + "a \"boing\" sound", + "boing sound", + "boing sound effect", + "boing sound effects", + "boinging", + "a boing", + "a boing occurs", + "boings", + "a boing sound", + "doorbell rings", + "a dong sounds", + "A sharp, springy boing sound echoed playfully", + "The resonant boing of a bouncing object was clear", + "A tonal, upward-sweeping boing sound filled the space", + "The unique spring-like boing sound was unmistakable", + "A quick boing sound broke the silence, adding a playful touch" + ] + }, + 
{ + "event": "breaking", + "phrases": [ + "breaking", + "breaking sound", + "a breaking sound", + "something breaking", + "something breaks", + "things break", + "things break nearby", + "something is breaking", + "a sound of something breaking", + "the sound of things breaking", + "shatter", + "shatter sound", + "a shatter sound", + "something shatters", + "things shatter nearby", + "a sound of something shattering", + "A sharp, cracking noise signaled something breaking", + "The sound of shattering filled the air as an object broke apart", + "A loud snap followed by a crunch indicated breaking", + "The distinct cracking noise of breaking material was unmistakable", + "A sudden, loud break was heard as the object fractured" + ] + }, + { + "event": "breathing", + "phrases": [ + "breathing", + "breathing heavy", + "breaths", + "breathing intermittently", + "breathing noise", + "breathing sound", + "a breathing", + "breath blowing", + "breathes", + "breathing heavily", + "breathing audible", + "someone is breathing in and out close to the microphone", + "breathing like a breathing apparatus", + "breathing sounds", + "someone is breathing after holding breath", + "someone is breathing loudly in and out", + "brief heaving breathing", + "a person is breathing loudly and deeply", + "breathing with surface contact", + "human breathing", + "breathing is audible", + "someone heavily breathing close to the microphone", + "breath sounds", + "someone is breathing in and out", + "a person breathes moderately", + "breathing in between", + "man breathing", + "deep breathing", + "labored breathing", + "breathing over surface contact", + "A soft, rhythmic breathing sound was clearly audible", + "The steady inhale and exhale of breath filled the space", + "A faint, labored breathing noise indicated exertion", + "The sound of quickened breathing suggested heightened activity", + "A calm, slow breathing sound created a sense of relaxation" + ] + }, + { + "event": "burping", + 
"phrases": [ + "burping", + "burps", + "burping noises one after another", + "burping sounds", + "burping occurs", + "burping noises", + "burping sound coming from a person", + "non-distorted burps", + "several burping noises one after another", + "burping occurs repeatedly", + "human burping", + "someone burps for a few seconds in a row", + "repetitive burping", + "a series of deep burping noises one after another", + "burping noise", + "a short burp", + "a series of burping noises", + "a brief short burp", + "a burp occurs", + "a person burps loudly for a long time nearby", + "burping takes place", + "loud, long burps", + "a person burps loudly and steadily", + "a series of burps one after another", + "a person burbs for a period", + "a man burps for a time", + "A low, guttural burp interrupted the quiet", + "The sound of a sudden burp was unmistakable", + "A loud belch resonated briefly in the air", + "A short, sharp burping noise was heard in the background", + "The deep, rumbling sound of a burp added a humorous note" + ] + }, + { + "event": "bus", + "phrases": [ + "bus", + "a bus", + "bus sound", + "a bus passes", + "a bus passing by", + "a bus driving off", + "a bus driving", + "bus driving sound", + "bus moving", + "The steady hum of a bus engine filled the interior", + "A low rumbling sound accompanied the movement of the bus", + "The hiss of air brakes punctuated the bus's stop at a station", + "The rhythmic clatter of wheels on the road marked the bus's journey", + "A faint creaking noise came from the bus's suspension as it turned" + ] + }, + { + "event": "busy_signal", + "phrases": [ + "busy signal", + "busy signal sounds", + "busy signals", + "a telephone busy signal sounds", + "busy signals from a phone", + "a busy signal beeps", + "a busy signal ringing", + "a telephone is ringing with a busy signal", + "busy signals during telephone calls", + "busy signals play", + "A repetitive tone rang out over the phone line", + "The steady, rhythmic beeping of 
a busy signal filled the receiver", + "A monotone, pulsing sound indicated the line was engaged", + "The consistent beep of a busy tone was unmistakable", + "A sharp, repeating tone signaled the failure to connect" + ] + }, + { + "event": "buzz", + "phrases": [ + "buzz", + "buzzing", + "a buzz sound", + "buzzing noise", + "a short buzz", + "a buzz sounds", + "buzzing in stereo", + "it buzzes", + "buzzings", + "buzzes", + "buzzing occurs continuously", + "buzzing sound", + "a long buzz sounds", + "a buzz", + "buzz sounds", + "buzz noises", + "buzzing sounds", + "a quick buzz", + "buzzing vibrations", + "buzzing sounds produces", + "buzzing noises", + "another buzz", + "buzzing occurs", + "A rapid buzzing sound filled the air as something vibrated", + "The faint hum of a flying insect was audible nearby", + "A sharp, continuous buzz broke the silence", + "The background was filled with the low, vibrating buzz of motion", + "The rhythmic buzzing of wings created a natural vibration in the air" + ] + }, + { + "event": "buzzer", + "phrases": [ + "buzzer", + "buzzers", + "buzzer sound repeatedly in a series", + "buzzer sounds", + "a buzzer goes off", + "buzzers sound", + "a buzzer rings", + "a buzzer sound", + "buzzer is speaking", + "a buzzer going off", + "buzzers sound repeatedly", + "a buzzer sounds", + "a buzzer horn", + "an apartment buzzer is ringing", + "A loud, steady buzzing sound emitted from the device", + "The sharp tone of a buzzer signaled an alert", + "A mechanical buzzing noise indicated the activation of a signal", + "The high-pitched buzz of a warning device filled the room", + "A continuous, oscillating buzzer sound captured attention" + ] + }, + { + "event": "camera", + "phrases": [ + "camera", + "camera shutter", + "camera sound", + "single-lens reflex camera sounds", + "single-lens reflex camera", + "camera handling sound", + "camera mechanisms", + "camera flashes", + "camera interaction", + "camera effects", + "camera sounds", + "camera muffling", 
+ "camera noise", + "cameras snapping", + "rustling with a camera", + "camera zooms", + "camera rattling", + "cameras taking pictures", + "camera clicking", + "camera tapping noise", + "a camera shot", + "A quick shutter click marked the capture of a photograph", + "A soft whirring sound accompanied the camera's autofocus", + "The distinct mechanical sound of a camera's shutter was heard", + "A brief beep indicated the camera was ready to shoot", + "The faint winding noise of a film camera added a nostalgic touch" + ] + }, + { + "event": "car", + "phrases": [ + "car", + "the sounds of cars", + "car sound", + "vehicle sound", + "the sound of cars", + "a car sound", + "car sounds", + "the sound of car", + "cars", + "vehicle move", + "a vehicle engine turns over", + "a car", + "car sound effect", + "car motor turning", + "car starts", + "vehicle sounds", + "a car passes by quickly", + "a car making vroom sounds", + "drive-by sound of a passenger car", + "The low purr of a car engine idling filled the space", + "A steady rumble accompanied the car's movement", + "The hiss of tires on the road was clearly audible", + "A faint clicking sound came from the car's turn signal", + "The rhythmic thud of the engine marked the car's operation" + ] + }, + { + "event": "car_alarm", + "phrases": [ + "car alarm", + "a car alarm is being set off and reset", + "a car alarm goes off repeatedly", + "a car alarm", + "a car alarm goes off", + "a car alarm is repeatedly sounding", + "car alarms", + "a car alarm is sounding repeatedly", + "a car alarm going off", + "cars alarm", + "a car sounds its alarm", + "a car alarm is being set", + "the sound of a car alarm", + "beeping of a car alarm", + "a car alarm beeps loudly", + "an alarm sounds on a motor vehicle", + "sound of a car alarm", + "a car alarm disarming beep", + "a car alarm is beeping", + "a car alarm is going off", + "a vehicle alarm", + "a car alarm blares", + "car alarms beep", + "a car alarm beeps", + "a car alarm ringing", + 
"a quick car alarm goes off", + "a car alarm is being replicated", + "a car alarm sounding", + "A loud, repetitive alarm tone blared from the car", + "The sharp, alternating siren of a car alarm filled the area", + "A high-pitched wailing sound came from the vehicle's alarm system", + "The insistent beeping of a car alarm signaled intrusion", + "A piercing alarm noise echoed as the security system activated" + ] + }, + { + "event": "car_passing_by", + "phrases": [ + "drive-by sound of a passenger car", + "vehicle pass by outside", + "a vehicle passed by and accelerates quickly", + "car sound", + "a car passes by quickly", + "a fast car moving away", + "a car passes by", + "vehicle sound", + "a car zooms by", + "vehicle driving away quickly", + "The low rumble of a car grew louder as it approached", + "A brief whooshing noise marked the car speeding past", + "The sound of tires on the pavement faded into the distance", + "A Doppler-shifted engine noise indicated movement past the listener", + "The rhythmic clatter of a car passing by was noticeable" + ] + }, + { + "event": "cat", + "phrases": [ + "cat", + "a cat meows", + "a cat hiss", + "a pet cat meows", + "a pet cat sound", + "a cat is growling", + "a cat purrs", + "a cat meowing in response", + "a cat meows and growls", + "a cat singing", + "cat sound", + "A soft purring sound came from the cat", + "The sharp meow of a feline was clearly audible", + "A low growling sound indicated the cat's displeasure", + "The rhythmic chirping of a cat added curiosity to its demeanor", + "The faint sound of a cat's paw scratching was heard nearby" + ] + }, + { + "event": "chainsaw", + "phrases": [ + "chainsaw", + "chainsaw cutting", + "chainsaw operate", + "a chainsaw runs cutting an object", + "a chainsaw runs before coming to an idle briefly", + "chainsaw running", + "chainsaws", + "chainsaw intermittent rev down", + "a chainsaw cutting and revving", + "chainsaw being run", + "a chainsaw slows down and revs again", + 
"chainsaw revs continuously", + "a chainsaw starting and revving", + "an electric chainsaw is turning on and off", + "a chainsaw motor", + "a chainsaw cutting", + "a chainsaw revving", + "a chainsaw operating and cutting through an object", + "a chainsaw cuts", + "a chainsaw engine running and revving up", + "a chainsaw is used and revved multiple times", + "a chainsaw revving sporadically", + "a chainsaw", + "a chainsaw revving up and down", + "chainsaw cutting wood", + "a chainsaw is started and begins cutting a solid object", + "a chainsaw runs and then stops", + "A loud, roaring sound indicated the chainsaw was in use", + "The rhythmic buzzing of the chainsaw filled the air", + "A high-pitched whine came from the chainsaw's blade cutting through material", + "The mechanical growl of the chainsaw was unmistakable", + "A sharp, grinding noise accompanied the chainsaw's operation" + ] + }, + { + "event": "cheering", + "phrases": [ + "cheering", + "cheering crowd", + "cheering crowds", + "cheering amid crowds", + "cheering with applause", + "cheering together", + "celebrations", + "crowd cheering", + "cheering at an event", + "cheers", + "cheering continues to come from the crowd", + "cheering with shouting", + "cheer", + "crowd cheers", + "crowd celebrations", + "background cheer", + "A loud, enthusiastic cheer erupted from the crowd", + "The sound of clapping and shouting created an energetic atmosphere", + "A rhythmic chant of cheering voices echoed across the area", + "The joyous sound of applause and cheering filled the air", + "A wave of cheering voices surged with excitement" + ] + }, + { + "event": "child_singing", + "phrases": [ + "a child singing voice", + "a child sings", + "a kid sings", + "a child singing", + "a kid singing", + "children sing", + "a child's singing", + "children's vocals", + "a child chants", + "child singing", + "a child is singing repeatedly", + "the child sings", + "children are singing in call and response", + "an older child 
singling", + "a child sound", + "a kid is singing", + "A soft, melodic voice of a child sang a tune", + "The high-pitched singing of a child filled the room", + "A faint humming noise accompanied the child's singing", + "The cheerful, rhythmic singing of a child was unmistakable", + "A gentle lullaby-like singing came from the child" + ] + }, + { + "event": "child_speech", + "phrases": [ + "kid speech", + "child speech", + "toddler speaking", + "child's speech", + "children's speech", + "children speech", + "child speeches", + "a speaking child", + "kids speech", + "children saying goodbye", + "a kid speaks", + "a child speaks", + "a child speak", + "young child speaking", + "kid speaking", + "a young child speaks loudly", + "a young child speaks", + "a child answers", + "a child is speaking phrases", + "a young child speaking", + "a child speaking", + "child speaks", + "a kid's voice", + "a young child is making a speech", + "a kid talk", + "a young kid speaks", + "child speaking", + "a child says words", + "A high-pitched voice uttered words in a child's tone", + "The soft chatter of a child was heard in the background", + "A cheerful, energetic voice marked the child's speech", + "The rhythmic articulation of a child speaking was audible", + "The playful tone of a child added liveliness to the surroundings" + ] + }, + { + "event": "chime", + "phrases": [ + "chime", + "a mystery chime", + "a short chime", + "chime accompaniment", + "a sound for a positive event in a game", + "a shining sound effect", + "a bell chime", + "a sound effect signaling a transition or completion", + "a chime", + "A soft, melodic chime rang out clearly", + "The resonant sound of chimes filled the air", + "A brief, high-pitched chime marked the passage of time", + "The rhythmic ringing of chimes created a soothing ambiance", + "The gentle tone of a chime added a musical touch to the environment" + ] + }, + { + "event": "chirp", + "phrases": [ + "chirp", + "bird tweet", + "chirp tone", + 
"edited bird chirp sounds", + "electronic sound effect", + "brids chirp", + "pet birds tweet", + "birds chip", + "a bird tweets sharply", + "bird vocalizations outside", + "a bird calling three times", + "a bird song is playing", + "bird singing", + "bird songs", + "loud bird song", + "a bird is tweeting a bird song", + "a sweet bird song", + "a bird is chirping", + "a bird vocalizes repeatedly", + "a bird vocalizes", + "a drumming bird call", + "chirps", + "A quick, high-pitched chirp came from a small bird", + "The rhythmic chirping of birds was clearly audible", + "A brief, melodic chirp broke the silence", + "A soft chirp added a natural element to the soundscape", + "The continuous chirping of small birds created a lively atmosphere" + ] + }, + { + "event": "civil_defense_siren", + "phrases": [ + "civil defense siren", + "a long drawn-out siren, tapering off at the end", + "a civil defense siren sounds", + "a european siren approaches", + "a civil defense siren blaring and winding down", + "a defense siren sounds", + "a siren blasts close by", + "a civil defense siren blares", + "a civil defense siren is ringing", + "a civil defense siren is going off", + "a civil defense siren blares loudly in the distance", + "a civil defense siren blares loudly", + "a civil defense siren is blaring", + "a civil defense siren blares in the distance", + "a civil defense siren blow", + "a civil defense siren is sounding", + "A loud, wailing siren echoed across the area, signaling an emergency", + "The oscillating tone of a civil defense siren filled the air", + "A high-pitched, rising and falling siren sound warned of impending danger", + "The continuous blare of a siren created an atmosphere of urgency", + "The distinct sound of a warning siren was unmistakable" + ] + }, + { + "event": "clang", + "phrases": [ + "clang", + "metal clang", + "a metallic clang", + "metal clank", + "a loud metal clang", + "metallic percussion hit", + "a louder metal clang", + "a clang sound", + 
"clangs", + "a metal hit", + "a metallic object hits", + "a metal clang", + "metal hits", + "metal is being hit by a hammer and ringing", + "a loud metal clank", + "metal clink", + "a cling", + "a metallic clank", + "A loud metallic clang resonated as the object was struck", + "The sharp, echoing clang of metal filled the space", + "A sudden, discordant clang broke the silence", + "The resonant clang of hollow metal rang out clearly", + "The deep, vibrating clang of a struck metal structure was audible" + ] + }, + { + "event": "clapping", + "phrases": [ + "clapping", + "clapping hands", + "clapping begins", + "clapping ensues", + "applauding and clapping", + "hands clapping loudly", + "hands clapping in applause", + "clapping at an event", + "applause and clapping", + "clapping and applause", + "clapping and applause sounds", + "mid frequency applause", + "applause being given", + "applauding", + "clapping noises", + "a smattering of applause", + "mid frequency applauding", + "clapping sounds", + "clapping from group of people", + "a crowd makes applause noises", + "an audience applauds continuously", + "clapping occurs", + "clapping takes place", + "loud clapping", + "a loud chorus of clapping", + "a clapping", + "a loud applause", + "an audience claps continuously", + "an audience applauding continuously", + "continued clapping", + "A single, sharp clap echoed in the room", + "The rhythmic clapping of hands created a percussive sound", + "A soft, quick clap marked the sound of approval", + "The faint sound of hands clapping was heard nearby", + "A solitary clap broke through the ambient noise" + ] + }, + { + "event": "clicking", + "phrases": [ + "clicking", + "tracks click", + "clicking sound effects", + "a clicking", + "ticks", + "gears click", + "clicks", + "a click repeating", + "tracks clicking", + "clicking occurs consistently", + "clicking repeatedly", + "a quick click", + "ticks intermittently", + "clicking occurs repeatedly", + "quick clicking", + "a 
click", + "a rapid, regular soft click sounds from nearby", + "a click track", + "a clicking several times", + "repetitive quick ticking", + "A quick, sharp clicking sound came from the device", + "The faint click of small objects tapping together was audible", + "A series of rhythmic clicking noises filled the air", + "The brief, distinct sound of a click broke the silence", + "A mechanical clicking noise indicated the activation of a switch" + ] + }, + { + "event": "computer_keyboard", + "phrases": [ + "typing sound", + "keyboard", + "keyboard sound", + "touchscreen typing", + "spray sound", + "tap tap sound", + "typing sound on computer keyboard", + "computer keyboard mechanisms", + "a man is typing on a computer keyboard", + "touchscreen typing", + "someone is typing keys on a computer keyboard", + "typing on a computer keyboard", + "typing on keyboard", + "typing noise", + "fingers typing on a keyboard", + "computer keyboards sound", + "The rhythmic tapping of keys created a steady typing sound", + "A soft clicking noise accompanied each keystroke", + "The sound of rapid typing filled the room with energy", + "The faint clatter of a keyboard was heard in the background", + "The mechanical clicking of keys indicated active use" + ] + }, + { + "event": "cough", + "phrases": [ + "cough", + "coughing", + "boy coughing", + "a boy coughs deeply", + "coughs", + "continuous coughing", + "man coughing", + "human coughing", + "a male coughing", + "male coughing", + "an adult male coughs", + "an adult male clearing his throat", + "someone coughs several times in quick succession", + "female coughing", + "cough sounds", + "a male coughs", + "a male clearing his throat", + "a man cough", + "two females coughing", + "someone is coughing a number of times", + "a cough", + "a female coughs", + "men cough", + "he coughs", + "coughing men", + "baby coughing", + "coughing sounds", + "a coughing man", + "an adult female coughs", + "A sudden, sharp cough broke the silence", + "A 
low, muffled cough was audible nearby", + "The repeated sound of coughing filled the air", + "A harsh, guttural cough echoed briefly", + "The distinct sound of a cough indicated discomfort" + ] + }, + { + "event": "cricket", + "phrases": [ + "cricket", + "crickets", + "crickets chirp intermittently in the background", + "crickets at sunset", + "crickets chirp continuously in the background", + "the sounds of crickets", + "crickets are chirping in the forest", + "crickets are chirping rapidly and loudly", + "crickets are chirping in a forest", + "cricket field recording", + "synthesized cricket sounds", + "a cricket chirping at steady intervals", + "cricket chirp", + "crickets chirp intermittently", + "crickets in bushes", + "crickets in recording", + "crickets chirp in background", + "crickets croak in the background", + "crickets chirp in the background", + "cricket sound recorded ", + "the chirp of crickets in the background", + "crickets are making a steady sound", + "crickets chirping in the background", + "croaking crickets", + "a cricket chirping loudly", + "crickets are chirping in the mountains", + "a continuous chorus of cricket sounds", + "crickets are chirping in background", + "crickets vocalize", + "crickets chirp from a distance", + "The rhythmic chirping of crickets filled the night air", + "A faint, continuous cricket song was audible in the background", + "The persistent chirping of a cricket added to the nocturnal ambiance", + "The high-pitched chirping of crickets created a natural melody", + "The distinct sound of crickets marked the quiet of the night" + ] + }, + { + "event": "croak", + "phrases": [ + "frogs croak", + "frogs chirp", + "multiple frogs croak at the same time", + "numerous number of frogs croaking", + "frogs vocalize", + "several frogs croak", + "long croaking from a frog", + "croaks", + "long groaning from a frog", + "frogs croak nearby", + "a frog croaks repeatedly", + "frogs making croaking sounds", + "frog sticks tongue out", 
+ "A low, guttural croak came from a nearby frog", + "The harsh croaking of a frog echoed across the pond", + "A deep, raspy croak marked the frog's call", + "The repetitive croaking of frogs created a natural chorus", + "The faint sound of a croak was audible in the distance" + ] + }, + { + "event": "crowd", + "phrases": [ + "crowd", + "crowds", + "crowds gather", + "crowd is making noise from a medium perspective", + "large crowd", + "crowd of people", + "murmuring crowd", + "a loud bustling crowd", + "the noisy crowd", + "a large crowd of people are noisy", + "people crowd", + "a bustling crowd", + "the noise of the crowd", + "crowded human voices", + "a crowd hubbub", + "crowd voice", + "a large noisy crowd", + "a noisy crowd", + "crowd with human voices", + "people crowding", + "a large noisy crowd having fun", + "background noise at a crowded event", + "the noise of a crowd", + "background noise of a crowd", + "a large crowd mummers", + "a crowd's noise", + "a noisy crowd of people", + "a crowd makes hubbub", + "a crowd of people is milling around loudly and very close", + "a crowd make noise", + "The indistinct murmur of a crowd filled the background", + "A sudden cheer erupted from the crowd, breaking the ambient noise", + "The sound of overlapping conversations created a lively atmosphere", + "A wave of applause and chatter came from the large gathering", + "The rhythmic chanting of a crowd echoed through the area" + ] + }, + { + "event": "crowing", + "phrases": [ + "crowing", + "crowing sounds", + "a crowing sound", + "cock crowing", + "roosters caw", + "rooster crows", + "more crowing", + "rooster is crowing", + "various chicken crowing", + "crowing from bird", + "a cockrel crowing", + "a crowing", + "a cock crowing", + "cawing crows", + "roosters are crowing", + "crowing roosters", + "rooster crowing", + "a chicken crows", + "crows", + "crow caws", + "chickens cawing", + "some kind of bigger bird crows continuously", + "a rooster is crowing", + "a 
chicken is crowing", + "roosters and chickens are crowing", + "fowl are crowing", + "a chicken crowing", + "crows coo", + "a chicken is cawing", + "the sound of crows", + "A loud, rhythmic crowing sound came from a nearby rooster", + "The distinctive multi-syllable crow of a rooster filled the morning air", + "The sharp crowing of a rooster signaled the break of dawn", + "The repetitive crowing of a rooster echoed through the farm", + "A high-pitched crowing noise indicated the presence of a rooster nearby" + ] + }, + { + "event": "crumpling", + "phrases": [ + "crumpling", + "crumpling paper", + "crumpling occurs continuously", + "crumbling paper", + "an item crumpling", + "crumpling an object", + "crinkling", + "crumpling some material", + "paper crumpling", + "crumpling occurs repeatedly", + "crumpling with surface contact", + "paper is crinkling and crumpling", + "the sound of crumpling", + "crinkling a rubber object", + "plastic is crinkling and crumpling", + "paper is crumbling and crinkling", + "the crinkling of plastic", + "the sound of crumpling paper", + "paper is crumpling consistently", + "crumpling some packet", + "crumpling of an object", + "crackling paper", + "continuous crumpling", + "crumpling of material", + "something crumples and crinkles", + "wrapper crinkling", + "paper is being crumpled and crinkled", + "paper is crumpling continuously", + "paper is crumpled and crinkled", + "A soft, crackling sound came from crumpling paper", + "The distinct rustling of material being crumpled filled the air", + "A faint crumpling noise marked the handling of a flexible sheet", + "The sharp, crisp sound of aluminum foil crumpling was audible", + "The rhythmic sound of crumpling paper created a subtle texture in the background" + ] + }, + { + "event": "crunch", + "phrases": [ + "crunch", + "cookie crunch", + "a crunch", + "crunching", + "crisp crunches", + "crumpling sound", + "A loud, crisp crunch came from the brittle material breaking", + "The sharp crunch 
of footsteps on gravel was clearly audible", + "A faint, repeated crunching sound marked the crushing of a substance", + "The distinct crunch of a brittle object breaking was unmistakable", + "The rhythmic crunching of leaves underfoot created a natural soundscape" + ] + }, + { + "event": "crying", + "phrases": [ + "crying", + "an adult male sighs while crying", + "an adult male sobs while crying", + "a woman sobs", + "crying (fake crying)", + "a woman sobbing", + "a man sobbing", + "a young woman crying", + "occasional sobbing", + "a woman sobs loudly", + "a person makes sobbing noises", + "crying over a television", + "an adult female is sobbing", + "crying noise", + "someone sobs", + "crying sounds", + "someone is sobbing", + "someone is crying in a cartoonish manner", + "someone sobbing", + "someone sobs intermittently", + "a man pretend crying", + "a person sobbing", + "someone else is sobbing", + "a man sobs", + "woman crying", + "a woman is crying hysterically in pain", + "a young boy crying", + "someone cries intermittently", + "a person is crying and sobbing", + "crying noises", + "A soft, trembling sobbing sound broke the silence", + "The faint, erratic breathing noises of crying were audible", + "A rhythmic, muffled crying sound filled the air", + "The high-pitched wailing of someone crying echoed in the distance", + "The repetitive sobbing noises created an emotional atmosphere" + ] + }, + { + "event": "dental_drill", + "phrases": [ + "dental drill", + "the sound of a dental drill", + "a dentist drill", + "a dental drill", + "a drill is being used at the dentist", + "A high-pitched, whirring sound came from the dental drill", + "The sharp, continuous whine of the drill filled the room", + "A piercing sound accompanied the use of the dental drill", + "The faint vibration noise of the drill was audible in the background", + "The rhythmic, mechanical sound of the dental drill was unmistakable" + ] + }, + { + "event": "dial_tone", + "phrases": [ + "dial 
tone", + "electronic dial tone", + "dial tones", + "electronic touch tone telephone dialing", + "telephones dial and ring", + "dialing tones", + "a telephone dialing tone ringing", + "a telephone dial tone", + "a telephone dial tone occurs", + "people dial", + "keypress tone", + "dialing", + "dialing occurs on a telephone", + "a telephone is dialed once", + "a telephone dialing tone", + "keypress tones", + "a phone dialing", + "men are dialing on a phone with a mechanical tone", + "a man presses buttons creating tones on a telephone", + "a wireless phone is turning on and dialing", + "A steady, monotone hum indicated the line was ready", + "The low, continuous tone of the dial tone filled the receiver", + "A faint, consistent hum marked the sound of the dial tone", + "The rhythmic, unchanging tone of a dial signal was audible", + "The distinct sound of a dial tone confirmed the connection" + ] + }, + { + "event": "ding", + "phrases": [ + "ding", + "a ding", + "a ping", + "dinging", + "a ting occurs", + "a metal ding", + "a tinging occurs", + "ding repetitions", + "a bell ding", + "a bell donging", + "ding sounds from a video game", + "an electronic ding-dong", + "a ting", + "dings", + "a final ding", + "a ding repeats", + "a dinging sound", + "a metal ting", + "a single ding", + "a metal bell ding", + "a bell tings", + "ding-dongs", + "a ding-dong sound", + "A quick, high-pitched ding sound rang out clearly", + "The soft, metallic ding of a small object being struck was audible", + "A sharp ding noise echoed briefly in the room", + "The faint, rhythmic ding of a bell added a delicate touch", + "A single, crisp ding marked an event in the background" + ] + }, + { + "event": "ding-dong", + "phrases": [ + "a ding-dong", + "ding-dongs", + "ding-dong sounds", + "a ding-dong sound", + "a ding-dong sound effect plays", + "a ding-dong sound in the background", + "an electronic ding-dong", + "ding-dong sound", + "a ding-dong in a small room", + "a doorbell ding-dongs", + "A 
melodic ding-dong chime rang out", + "The rhythmic two-tone ding-dong sound filled the air", + "A clear, cheerful ding-dong announced someone's presence", + "The soft echo of a ding-dong chime lingered briefly", + "The distinct ding-dong of a doorbell was unmistakable" + ] + }, + { + "event": "dog", + "phrases": [ + "dog", + "the dog sound", + "dogs", + "a dog making a sound", + "dog contact sounds", + "dog sounds", + "a dog", + "pet dog barking", + "dog sound effect", + "the sound of a dog", + "dog barking", + "a dog barks in response", + "dog yips", + "a dog is reacting to something violently", + "real dog bark", + "dog surface contact", + "dogs bark several times", + "bark", + "a dog barking in response", + "dog barks twice", + "several dogs make bow-wow", + "dog yipping", + "a dog is trying to bark in its sleep", + "dogs fight", + "A low, rumbling growl came from the dog", + "The sharp bark of a dog echoed nearby", + "A soft whimper was heard as the dog communicated its emotions", + "The rhythmic panting of a dog created a steady background noise", + "The excited yapping of a dog filled the air" + ] + }, + { + "event": "door", + "phrases": [ + "door", + "door closing", + "a door", + "door opening", + "doors", + "a closing door", + "door opens", + "door bang", + "door shutting", + "door slamming", + "closing an old wooden door", + "a door closing", + "the closing of a door", + "closes a door", + "door open and close", + "the sound of a door", + "door clanking", + "door slams", + "a door to a block of flats is closing", + "man closing door", + "a door opening and then closing", + "a person opening and closing a door", + "a door opens and closes shut", + "an opening of a door", + "someone is closing a door in a bathroom", + "a door shut", + "the opening and closing of a door", + "A creaking sound marked the slow opening of a door", + "The sharp slam of a door closing echoed briefly", + "A faint squeak accompanied the movement of the door hinges", + "The rhythmic 
knocking on a door was clearly audible", + "The soft thud of a door shutting filled the room" + ] + }, + { + "event": "doorbell", + "phrases": [ + "doorbell", + "doorbell rings", + "doorbell sounds", + "doorbell ringing", + "apartment doorbell", + "a doorbell", + "doorbell chimes", + "a doorbell ding-dongs", + "a doorbell with an electric chime is ringing", + "a doorbell rings with ding-dong sounds", + "a doorbell chime", + "a door bell", + "doorbell noises", + "doorbells", + "a ringing doorbell", + "a bell on a shop door", + "mechanical door bell", + "a doorbell rings", + "A sharp, melodic chime of the doorbell rang out", + "The rhythmic ding-dong of a doorbell filled the air", + "A quick, clear doorbell tone announced a visitor", + "The faint ringing of a doorbell was heard in the background", + "A cheerful ding-dong sound echoed briefly as the doorbell was pressed" + ] + }, + { + "event": "drill", + "phrases": [ + "drill", + "hole is being drilled", + "loud drilling", + "drilling", + "drilling noises repeat several times", + "the sounds of a drill", + "a tool loudly drills into something", + "a power tool drilling again", + "a loud drill", + "continuous drilling loudly", + "a drill is drilling and subsequently ceases operation", + "the drilling of an object", + "a power tool sharply drilling", + "a power tool is drilling", + "the sounds of drilling", + "a power tool continues to make drilling noises", + "a drill spins loudly nearby and then stops", + "drill running", + "drill runs", + "drill running and shutting down", + "a drill drills repeatedly", + "a drilling and whirring sound", + "a loud drill into something and then turns off", + "a drill runs repeatedly", + "the sound of a drill being used repeatedly", + "drill working multiple times", + "drill getting stuck and stopping", + "someone is drilling a sheet of plywood", + "continuous drilling", + "power tools drilling", + "A high-pitched whirring sound came from the drill", + "The sharp grinding noise of the 
drill filled the room", + "A repetitive buzzing sound marked the drill's operation", + "The faint vibration noise of a drill was clearly audible", + "The rhythmic hum of the drill created a steady background noise" + ] + }, + { + "event": "ducks", + "phrases": [ + "ducks", + "ducks quack", + "ducks quacking", + "ducks call", + "ducks quack up close", + "ducks quacking continuously", + "ducks quack intermittently", + "ducks quack continuously", + "ducks quacking loudly", + "quacking ducks", + "ducks quaking", + "several ducks quack one after another", + "ducks quack repetitively", + "ducks quack loudly nearby", + "ducks quack loudly", + "ducks quack several times nearby", + "ducks quack multiple times in the distance", + "ducks squawk repeatedly", + "ducks are making waterfowl sounds", + "real ducks quack", + "multiple ducks quack repeatedly", + "a series of ducks quacking", + "ducks quaking continuously", + "a group of ducks are quacking", + "multiple ducks quack continuously", + "ducks quacking irregularly", + "several ducks are quacking intermittently", + "a number of ducks quacking at once", + "ducks respond to calls", + "a number of ducks quack", + "A loud quacking sound came from a group of ducks", + "The rhythmic quack of a duck filled the air", + "A soft, repetitive quacking noise was heard nearby", + "The distinct quack of ducks created a lively soundscape", + "The faint calls of ducks echoed in the background" + ] + }, + { + "event": "echo", + "phrases": [ + "echo", + "distant voices echo", + "echoes", + "echoing", + "A delayed, faint echo repeated the original sound", + "The sound of footsteps echoed off the walls", + "A clear, resonant echo filled the spacious area", + "The rhythmic echo of a voice lingered briefly", + "The distant echo of a clap was audible across the canyon" + ] + }, + { + "event": "electric_shaver", + "phrases": [ + "shaver", + "electric shaver", + "electric shaver sounds", + "electric razor sounds", + "electric shaver sound", + "an 
electric toothbrush buzzes", + "an electric shaver is switched off", + "an electric razor buzzes", + "the sound of a shave machine", + "sound of an electric shaver", + "a long buzz shaves something of wood", + "an electric shaver running", + "an electric shaver buzzes", + "an electric shaver is turning on and off", + "electric hair clipper being used", + "A steady buzzing sound came from the electric shaver", + "The sharp hum of the shaver filled the room", + "A faint vibration noise accompanied the shaver's operation", + "The rhythmic buzzing of the shaver created a consistent background tone", + "The distinct sound of the electric shaver was unmistakable" + ] + }, + { + "event": "emergency_vehicle", + "phrases": [ + "emergency vehicle", + "an ambulance blares its siren", + "emergency vehicle siren", + "ambulance", + "an emergency vehicle turns on the siren", + "an emergency vehicle siren changes to a higher pitched siren", + "an ambulance moving with its siren on", + "ambulance siren", + "an emergency vehicle approaches with its siren blaring", + "ambulance siren wail", + "an ambulence using its siren", + "emergency vehicle siren passing by", + "police emergency vehicle siren", + "a vehicle with sirens blaring approaches", + "a police car siren wails quickly", + "emergency vehicle siren blasts", + "an emergency vehicle passes by quickly", + "emergency vehicle's siren blares", + "emergency vehicle siren blaring steadily", + "a two-tone emergency vehicle siren blows as it approaches, passes by, and then fades into the distance", + "an emergency vehicle's siren wails", + "emergency vehicle sirens wail multiple times", + "a siren ringing of a passing emergency vehicle", + "an emergency siren passes by", + "police car with sirens blaring passing by", + "an ambulance is blaring its siren", + "fast urgent loud emergency siren", + "A loud, wailing siren signaled the approach of an emergency vehicle", + "The sharp, oscillating siren of an emergency vehicle filled the 
air", + "A high-pitched siren noise indicated the vehicle's urgency", + "The repetitive blaring of a siren announced the presence of an emergency vehicle", + "The distinct Doppler-shifted siren sound moved past quickly" + ] + }, + { + "event": "engine", + "phrases": [ + "engine", + "engine run", + "running engine", + "another engine", + "engine running", + "car engine", + "engine motor running", + "engine operate", + "an engine", + "motor engine", + "boat engine", + "an engine run", + "auto engine", + "light engine", + "engine sound effect", + "a engine runs", + "engine running once again", + "the engine runs", + "a second engine", + "engine in idle", + "its engine runs", + "engine sound", + "its engine running", + "motorboat engine", + "A deep, steady hum of an engine filled the space", + "The rhythmic thrum of the engine indicated it was in operation", + "A sharp rev of the engine broke the silence", + "The faint vibration noise of an engine was audible in the background", + "The low growl of the engine created a mechanical ambiance" + ] + }, + { + "event": "engine_knocking", + "phrases": [ + "engine knocks", + "a motor knocks", + "a car engine knocking", + "engine knocking sounds", + "a car makes an engine knocking sound", + "knocking engine", + "an engine runs and knocks", + "a motor runs and knocks", + "knocking engines", + "engine knocking noises repeatedly", + "engine knocking", + "a vehicle engine runs knocking", + "a medium engine making knocking sounds", + "car making engine knocking sounds", + "a car's engine is knocking", + "clicking from a engine", + "an engine knocks while running", + "an engine that is knocking starting", + "an engine making a knocking noise", + "a medium engine makes knocking noises", + "engine sounds with tapping and thumping", + "a vehicle engine idles and knocks briefly", + "the engine is knocking", + "clicking of an engine", + "a motorcycle engine knocks", + "a car engine knocks", + "A sharp, metallic knocking sound came from 
the engine", + "The rhythmic pinging noise indicated a malfunction in the engine", + "A faint, repetitive knocking sound was audible as the engine ran", + "The distinct metallic ping of engine knocking was unmistakable", + "A loud, irregular knocking noise signaled engine trouble" + ] + }, + { + "event": "engine_starting", + "phrases": [ + "its engine starts", + "engine starts", + "a engine starting", + "car starts", + "a vehicle starter turns over", + "a motorboat starts up", + "engine starting", + "engine attempting to start", + "motor startup", + "a motor engine starting", + "an engine starts a second time", + "engine being started again", + "a engine starts up", + "an engine starts", + "an engine takes a few seconds to start", + "a motor starts", + "engines start", + "motor engine starts", + "a man starts an engine", + "an engine start", + "A loud, cranking noise marked the engine starting", + "The sharp whir of the starter motor was followed by the engine's hum", + "A faint clicking sound preceded the engine's ignition", + "The rhythmic revving noise indicated the engine was coming to life", + "A brief sputtering sound was followed by the steady hum of the engine" + ] + }, + { + "event": "explosion", + "phrases": [ + "explosion", + "an explosion sound", + "an energy-type explosion sound", + "explosion sounds", + "large explosion", + "a quick loud explosion", + "an explosion occurs", + "huge explosion", + "a explosion sound", + "an explosion", + "sudden explosion", + "another explosion explodes", + "a blast", + "an explosion sound effect", + "large deep explosion", + "a synthetic explosion sound", + "a disappointing explosion", + "an explosion noise", + "an explosive sound", + "an explosion happens", + "a sound effect explosion", + "A loud, booming explosion echoed across the area", + "The sharp crack of an explosion was followed by a rumbling noise", + "A sudden, deafening blast filled the air", + "The distinct sound of a powerful explosion was unmistakable", 
+ "A deep, resonant boom signaled the detonation of explosives" + ] + }, + { + "event": "fart", + "phrases": [ + "fart", + "fart over and over", + "a fart", + "farts", + "farting", + "fart sound effects", + "a fart sound effect", + "someone farts", + "a fart loop", + "a long fart", + "the sound of a fart", + "a sound of a fart echoing in a sound booth", + "a fart sound effect plays", + "fart sounds", + "a recorded fart", + "a funny fart", + "men fart", + "human farting", + "a fart escapes", + "someone is composing their own farts in a loop", + "a series of farting", + "someone is making a fart sound effect", + "fart noises", + "A brief, low-pitched fart noise broke the silence", + "The sharp, quick sound of flatulence was clearly audible", + "A faint, muffled fart sound lingered briefly", + "The repetitive, comical sound of flatulence created a humorous tone", + "A loud, resonant fart noise filled the room momentarily" + ] + }, + { + "event": "female_singing", + "phrases": [ + "a female singer performs several songs", + "a female singer performs multiple times", + "a female singer is performing", + "a female sing", + "a female singer performs", + "a female sings", + "a female vocalist", + "female singer performs", + "a female singer sings", + "a female voice sings multiple times", + "a female is singing with lots of reverb", + "a female singer sing", + "a female voice sings", + "a female singer", + "two women sing", + "a female singer speaking", + "female voices sing a song", + "a female singing with mechanisms", + "male and female singers perform", + "female singers perform", + "a female singer singing", + "singing with female vocals", + "a female voice sings continuously", + "a female voice is singing a song", + "female sings", + "a female singer performing", + "a female sings along", + "a female singer perform", + "female vocals sing", + "A soft, melodic voice of a woman sang a gentle tune", + "The clear, high-pitched singing of a woman filled the air", + "A 
rich, vibrant tone marked the woman's singing voice", + "The rhythmic, soothing melody of female singing created a peaceful ambiance", + "The faint, harmonious hum of a woman singing was audible in the distance" + ] + }, + { + "event": "female_speech", + "phrases": [ + "female speech", + "female speeches", + "female english speech", + "a spoken female voice", + "woman's speech", + "female speaking", + "a female speaking", + "a single female voice speaking", + "a female speaks", + "female making speech", + "brief female speech", + "continuous female speech", + "a woman speeches", + "female speaks", + "a woman speeching", + "a womans dialogue", + "women's speech", + "female voice", + "a woman is giving a confident speech", + "a female voice sporadically speaks", + "a female speaker speaking", + "female voice speaking", + "a female voice is narrating", + "a female voice speaks", + "a female voice speaking", + "repeated female speech", + "a woman having a narration", + "additional female speech", + "a females voice speaks out", + "a young woman speeches", + "A calm, steady voice of a woman was heard nearby", + "The rhythmic articulation of a woman's speech filled the space", + "The clear and deliberate tone of a female voice conveyed confidence", + "A soft-spoken female voice was audible in the background", + "The cheerful, lively speech of a woman added energy to the environment" + ] + }, + { + "event": "filing_(rasp)", + "phrases": [ + "dry scraping", + "filing (rasping) sounds", + "metal filing and scrapping a surface", + "filing (rasp) sounds", + "a filing sound", + "a man is filing something", + "wood being filed", + "a file rasps against a surface several times nearby", + "a man uses a filing rasp", + "rasping of file", + "rasping", + "a rasping filing continues", + "metal filing", + "a man is filing a piece of wood", + "scrapping and filing of wood", + "a rasping sound of filing", + "a rasping and filing sound", + "a file rubbing against a surface loudly", + 
"sharp filing sounds", + "A sharp, rasping sound came from the file against the metal", + "The rhythmic scraping noise of filing filled the workshop", + "A coarse, grating sound marked the action of a file on wood", + "The faint, repetitive rasping sound accompanied the filing motion", + "The distinct, metallic grinding noise of filing was unmistakable" + ] + }, + { + "event": "fire", + "phrases": [ + "fire", + "fires", + "fire effects", + "fire cracking sound", + "a fire burning inside", + "fire burning sound", + "A soft crackling sound came from the burning fire", + "The rhythmic popping of flames filled the air", + "A low, steady roar indicated the presence of a large fire", + "The faint hiss of burning material was clearly audible", + "The sharp, crackling noise of fire created a lively soundscape" + ] + }, + { + "event": "fire_alarm", + "phrases": [ + "fire alarm interior field recording", + "fire alarm", + "a fire alarm is tested and switched off", + "a fire alarm beeps continuously", + "a fire alarm sounds multiple times", + "fire alarm is sounding in a hospital", + "a fire alarm repeatedly", + "fire alarm with mechanisms", + "A loud, repetitive alarm tone blared across the room", + "The sharp, high-pitched wailing of the fire alarm filled the building", + "A rhythmic beeping noise signaled an emergency", + "The continuous blare of the fire alarm demanded attention", + "The distinct, piercing sound of a fire alarm was unmistakable" + ] + }, + { + "event": "fire_engine", + "phrases": [ + "fire engine", + "fire engine siren-blaring", + "fire trucks", + "fire trucks sound off", + "fire brigade signal", + "fire trucks sound", + "firetruck getting closer", + "fire truck pulls out", + "a fire truck driving", + "fire engines", + "fire engine approaching", + "fire truck", + "a fire engine is passing by", + "fire truck sirens outside", + "a fire truck is passing by", + "fire engine drives", + "the fire truck drives by", + "a fire engine starts and honks horn", + "a 
fire engine moving", + "a fire truck is driving by", + "crackling fire engine sirens", + "a fire truck is approaching", + "fire engines sound", + "a fire engine is making brakes", + "a fire engine is accelerating and making engine sounds", + "fire engine horns", + "a loud fire engine", + "a fire truck runs", + "a fire truck drives with sirens", + "A loud, wailing siren signaled the approach of a fire engine", + "The sharp, oscillating siren of a fire engine filled the air", + "A high-pitched siren noise indicated the urgency of the situation", + "The repetitive blaring of a siren announced the presence of a fire engine", + "The deep, rumbling engine noise of the fire truck accompanied its siren" + ] + }, + { + "event": "fireworks", + "phrases": [ + "fireworks", + "firework booms", + "fireworks sounds", + "fireworks go off", + "fireworks are echoing across a valley", + "fireworks noises", + "firework sounds", + "firework celebration", + "fireworks explode", + "firecrackers", + "fireworks hiss", + "occasional fireworks", + "fireworks are being recorded inside a house near an open window", + "fireworks explode crackling", + "fireworks burst", + "fireworks pop", + "fireworks near and far", + "fireworks going off", + "a series of several fireworks exploding one after another", + "the sounds of fireworks", + "fireworks burst loudly", + "fireworks explode and echo", + "multiple fireworks pop and crackle", + "a fireworks display takes place", + "fireworks crackle", + "fireworks fizzle", + "fireworks are going off outside an apartment", + "firecrackers go off", + "firecrackers burst", + "the sound of fireworks", + "A loud, cracking noise marked the explosion of fireworks", + "The sharp whistling sound of a firework ascending filled the air", + "A series of rapid pops and bangs created an energetic soundscape", + "The deep boom of a large firework echoed across the area", + "The faint crackling of fireworks added a festive ambiance" + ] + }, + { + "event": 
"fixed-wing_aircraft", + "phrases": [ + "fixed-wing aircraft", + "an aircraft's engine", + "airplane", + "aircraft", + "aircraft approach", + "the sound of a fixed-wing aircraft", + "the sounds of an aircraft", + "the sound of fixed-wing aircraft", + "aircraft softly accelerating", + "the sound of a fixed-wing airplane", + "airplane sounds", + "an aircraft engine gets louder as it approaches", + "an airplane engine runs consistently", + "an aircraft engine swirls in the background", + "a muted jet", + "plane taking off or landing", + "an aircraft intermittently", + "aircraft fly", + "steady jet engine running", + "engine of airplane taking off", + "an aircraft's propeller", + "large aircraft taking off", + "airplane ambience internal ground", + "fixed-wing aircraft fly overhead", + "humming of a nearby jet engine", + "A deep, steady hum of the aircraft filled the sky", + "The rhythmic drone of the engine signaled the aircraft's flight", + "A faint roar of the aircraft was audible in the distance", + "The oscillating whine of the aircraft engine created a mechanical soundscape", + "The continuous sound of propellers cutting through the air was distinct" + ] + }, + { + "event": "fly", + "phrases": [ + "a housefly buzzes briefly", + "a housefly buzzes around", + "a housefly buzzing sound", + "housefly", + "a housefly is buzzing", + "a housefly", + "housefly noise", + "housefly sound", + "fly sound", + "a fly buzzes around", + "a fly buzzes by", + "a fly buzzing", + "A faint buzzing sound marked the presence of a fly", + "The sharp, high-pitched buzz of a fly was clearly audible", + "The rhythmic droning of a fly added a subtle background noise", + "The soft fluttering of fly wings created a delicate vibration in the air", + "The persistent buzzing noise of a fly moved erratically around" + ] + }, + { + "event": "frog", + "phrases": [ + "frog", + "a frog croaks several times in a row", + "frog is making pulses", + "frog croaks", + "the sounds of a frog", + "a frog 
croaking repeatedly without stopping", + "a frog consistently croaks", + "a frog croaks seven times in the foreground", + "frog sounds", + "a frog croaking sharply several times", + "frog croaks twice", + "a frog croaking at regular intervals", + "a frog croaks multiple times", + "frog calls", + "frog making croaking sound", + "a frog chirps monotonously", + "a frog croaks several times", + "frog croaks twice again", + "the sound of a frog", + "a frog is chirping over and over", + "a frog is croaking multiple times", + "multiple frogs croak together", + "a frog repeatedly croaks", + "a frog croaks intensively", + "a frog continuously croaks at a fast pace", + "multiple frogs croak repeatedly", + "a frog croaks continuously", + "a frog noise from a toy", + "a frog", + "several frogs croak in rapid succession", + "A deep, guttural croak came from the frog", + "The rhythmic croaking of frogs filled the air", + "A sharp, repetitive frog call echoed in the distance", + "The faint chirping of tree frogs added a natural ambiance", + "The distinct croak of a bullfrog was clearly audible" + ] + }, + { + "event": "frying_(food)", + "phrases": [ + "frying food", + "food frying", + "frying of food", + "someone is cooking food in a deep-fat fryer", + "oil frying", + "steak frying", + "someone is frying sausage on a cast iron", + "frying", + "sausage is being fried on cast iron", + "foods being fried", + "something frying and crackling and sizzling the whole time", + "grease is frying in a skillet", + "frying food in a wok", + "food cooking", + "food being fried", + "someone is frying sausages on a stove", + "the sound of cooking food in oil or another fat", + "frying foods", + "a pan of food is frying on the fire", + "loud frying of food", + "a loud sizzling of food frying", + "steak is being fried in oil", + "a food item is frying and sizzling", + "louder and more vigorous frying sound", + "food begins sizzles while frying", + "cooking food sizzling", + "mixing of sizzling 
food", + "mushrooms are being fried at low temperature", + "the sizzling of frying food", + "food is frying with sizzling noises", + "A loud, sizzling sound came from the hot oil in the pan", + "The rhythmic crackling noise of frying food filled the kitchen", + "A sharp hissing sound accompanied the food as it cooked", + "The faint bubbling noise of frying oil added to the cooking ambiance", + "The persistent sizzling of frying food created an energetic soundscape" + ] + }, + { + "event": "giggle", + "phrases": [ + "giggle", + "a young boy making a fake giggle", + "a female giggle", + "a giggle", + "a young female giggles in the foreground", + "a young woman's crazy cute giggle", + "a female giggles", + "an adult female gags", + "a small girlish giggle", + "cartoon characters are giggling", + "a kid laughing comically", + "females giggle", + "giggling", + "a girl laughs hysterically", + "a cartoonish voice laughing", + "an adult female pretending to sneeze", + "cartoon characters laugh", + "a short giggle", + "giggles", + "an adult female giggles", + "a female and male giggle", + "a clown laugh", + "a silly laugh", + "a hysterical laugh of a small child", + "an adult female laugh", + "laughter of a woman", + "giggle sounds", + "an adult female laughs in the foreground", + "a child laughing in response", + "A soft, high-pitched giggle broke the silence", + "The faint, repetitive giggling of someone was audible nearby", + "A cheerful, melodic giggle filled the room with joy", + "The rhythmic giggling of a nervous person created a playful tone", + "The distinct sound of a giggle was unmistakable" + ] + }, + { + "event": "glass_shatter", + "phrases": [ + "glasses shatter", + "glasses breaking", + "shatter", + "a glass breaks", + "a glass plate is shattering on the floor", + "glass shatter", + "something heavy shatters a glassy material with deep reverb", + "glass breaks", + "glass to shatter", + "glass breaking sound", + "a glass shatter sound", + "glass shatters", + 
"glass shattering", + "a glass shattering sound", + "glass hitting pavement and shattering", + "glass breaking", + "a glass shatter", + "a glass shattering", + "glass shatters twice", + "glass broken on the floor", + "a series of glass shattering", + "a glass breaking", + "a glass jar is breaking on a tile floor", + "glass smashes", + "a shattering glass sound", + "people are shattering glass", + "glass shatter in the background", + "glass crashing", + "glass shatters loudly", + "glass shatters and breaks", + "A sharp, high-pitched sound marked the shattering of glass", + "The loud, cracking noise of glass breaking filled the air", + "A faint tinkling sound followed the initial shatter", + "The distinct sound of glass shattering was unmistakable", + "The echo of breaking glass lingered briefly in the room" + ] + }, + { + "event": "goat", + "phrases": [ + "goat", + "goats", + "goat bleeting", + "goats sounding", + "goat clears throat", + "goats bey", + "goat noise", + "goats baa", + "livestock bleat", + "goats footsteps", + "goats rustle", + "goat cries", + "goat bleat", + "goats baaing", + "goat sounds", + "goat bleats", + "goat noises", + "goat bleating", + "goats walk around", + "A soft, repetitive bleating sound came from the goat", + "The loud, rhythmic bleat of a goat filled the air", + "A faint, high-pitched bleating noise was audible nearby", + "The distinct call of a goat created a rural ambiance", + "The soft, low-pitched bleating of a goat added to the natural soundscape" + ] + }, + { + "event": "groan", + "phrases": [ + "groan", + "groans", + "a man deeply groans", + "moan", + "terrible monster groaning in pain", + "a strange creature from the abyss is making a weird groaning growly sound", + "a monster is groaning", + "a deep groan", + "customized hippo groan", + "a groan", + "animals groaning", + "a zombie is groaning", + "a single zombie groan", + "a groan repeated multiple times", + "a low groan is repeated several times", + "men groan", + "a male 
groans", + "groaning", + "a deep sub groan", + "A low, guttural groan of pain broke the silence", + "The faint, muffled groaning of someone was audible nearby", + "A sharp, high-pitched groan indicated disapproval", + "The rhythmic groaning of effort filled the room", + "The distinct sound of a groan conveyed discomfort" + ] + }, + { + "event": "growling", + "phrases": [ + "growling", + "growls", + "growling animals", + "growling dog", + "growling sounds", + "animal growling", + "a growling sound effect", + "growling intermittently", + "a growling animal", + "growl effect", + "growling noises", + "a deep growling voice", + "a softer version of a growl loop", + "a rhythmic roar", + "growling sounds with voice effects", + "a roar", + "a growling creature", + "low growling", + "The deep growling sound echoed in the distance", + "There was a guttural growl, warning of potential danger", + "The growling noise carried a sense of threat and aggression", + "Low growling sounds could be heard, signaling anger", + "The growl reverberated, creating an ominous atmosphere" + ] + }, + { + "event": "grunt", + "phrases": [ + "grunt", + "a grunt", + "an adult male grunting with exertion", + "painful male grunt sounds", + "a man grunts in a video game", + "painful male hurt sounds", + "a grunt from a man", + "a deep grunt", + "a man deeply grunts", + "A short grunt broke the silence", + "Grunting sounds were heard in the background", + "The grunt was low and rough, almost animalistic", + "Short, abrupt grunts punctuated the air", + "A deep grunt hinted at exertion or irritation" + ] + }, + { + "event": "gunshot", + "phrases": [ + "gunshot", + "gun shot", + "rifle shot", + "gunfire", + "shots", + "gun shoot", + "gunfires", + "shotsgun shots", + "shots sounds with a reverb effect", + "gun fire", + "gunfire with echoes", + "gun shots", + "gun fires", + "gunshot sounds", + "gunfire shots", + "gunshots", + "gunshot sound effects", + "shooting", + "gunshot pops", + "rifle shooting", + "a 
sniper rifle firing shot sound effect", + "a shot gun is blasting loudly with some reverb", + "loud arbalette shot", + "a gun shot", + "a shotgun is being fired on a training range", + "gun shots with echoes", + "gunfire echoes", + "gunshots echo", + "shots sounds from a pheasant shoot", + "gun shot noise", + "A loud gunshot pierced the air", + "Gunshots echoed sharply in the distance", + "The crack of a gunshot was sudden and startling", + "Multiple gunshots rang out in quick succession", + "The sharp report of a gunshot reverberated briefly" + ] + }, + { + "event": "gurgling", + "phrases": [ + "gurgling", + "gurgle", + "gargling", + "gargle", + "gurgling liquid", + "low gurgling", + "gurgles", + "low pitched gurgling", + "gurgling underwater", + "water gurgling", + "gurgling water", + "gargling with water", + "water gurgling repeatedly", + "gargling sounds", + "liquid gurgling", + "gurgling down a drain", + "gurgling water pouring", + "loud gurgling of water", + "water gurgling continuously", + "gurgling of water", + "water continuously gurgling", + "water gurgling vigorously", + "A gurgling sound of water was heard flowing steadily", + "The bubbling gurgle grew louder as the liquid poured", + "Gurgling noises came from the narrow stream nearby", + "The gurgling sound resembled water passing through a constriction", + "The rhythmic gurgling of the liquid was soothing" + ] + }, + { + "event": "hammer", + "phrases": [ + "hammer", + "mechanisms hammer", + "a hammer hammers", + "hammering", + "mono jackhammer", + "a jackhammer pounds", + "machines hammer", + "hammer being used", + "a hammer strikes", + "a hammer pounds a hard surface", + "a hammer", + "a person hammers a solid object", + "striking with a hammer", + "a hammer hits", + "hammer sounds", + "tools hammering", + "hammering several times", + "a hammer pounding", + "a hammer pounds repeatedly", + "a series of loud metal hammering", + "hammering with surface contact", + "a man uses a hammer", + "hammer 
sounds", + "a hammering", + "a person hammers on a surface", + "hammer tap", + "someone is using a hammer on a construction site", + "hammering an object", + "The rhythmic hammering sound echoed through the workshop", + "A series of sharp hammer blows could be heard", + "The hammering noise was consistent and deliberate", + "The metallic clang of the hammer striking was distinct", + "Hammering sounds punctuated the otherwise quiet environment" + ] + }, + { + "event": "helicopter", + "phrases": [ + "helicopter", + "helicopter rotor", + "helicopter sound", + "helicopter sounds", + "helicopter moving away", + "helicopter blades", + "quadcopter", + "quadcopter noise", + "helicopter engine", + "propeller sound of a helicopter", + "propeller sounds", + "helicopters start flying", + "helicopter flying away", + "helicopter rotors", + "propeller airscrew sounds", + "helicopter taking off", + "electronic helicopter sound", + "propeller noise", + "propellers turn", + "a propeller sound", + "helicopter flying", + "The distinct whirring of helicopter blades filled the air", + "A low-pitched chopping sound indicated a helicopter nearby", + "The helicopter's rotor noise grew louder as it approached", + "The steady beat of the rotor blades was unmistakable", + "Helicopter sounds hovered persistently in the background" + ] + }, + { + "event": "hiccup", + "phrases": [ + "hiccup", + "hiccupping", + "a hiccup sound", + "hiccuping twice", + "hiccuping", + "baby hiccup", + "hiccups", + "hiccup sounds", + "their hiccup sounds", + "an adult pretending to hiccup", + "a woman hiccup", + "a hiccup occurs", + "a woman hiccupping", + "hiccup in the background", + "women hiccup", + "male hiccuping", + "a man hiccupping", + "a series of hiccups", + "baby hiccuping", + "a hiccup", + "people hiccup", + "a person hiccups multiple times", + "hiccupping in the background", + "man hiccuping", + "hiccup sounds in the background", + "she hiccups", + "hiccupping sounds", + "people are hiccupping in the 
background noise", + "people are hiccupping", + "the woman hiccups", + "A soft hiccup broke the silence momentarily", + "The repetitive sound of hiccups echoed lightly", + "Each hiccup was followed by a small pause", + "The hiccup sound was short and involuntary", + "A rhythmic series of hiccups could be heard faintly" + ] + }, + { + "event": "hiss", + "phrases": [ + "hiss", + "hisses", + "hissing", + "hissing spray", + "steam hiss", + "compressed air hisses", + "compressed air hissing", + "a quick powerful hiss", + "compressed air releasing", + "repetitive hissing", + "compressed air", + "A long hiss could be heard, sharp and steady", + "The hissing sound grew louder and more pronounced", + "A sudden hiss broke the stillness of the room", + "The hiss was continuous and high-pitched", + "A faint hissing noise persisted in the background" + ] + }, + { + "event": "horse", + "phrases": [ + "horses", + "a horse makes a sound", + "horse sounds", + "a horse's sounds", + "a horse slows and stops", + "horse walking", + "a horse trollops", + "horse noise", + "a horse speeds down a path", + "a series of horse sounds", + "horses sound", + "horse breaths", + "a horse stops", + "horse exhaling", + "horses walk", + "horses tap", + "a horse", + "horse noises", + "horse snorts", + "noise from a horse", + "horse running", + "the sounds of horses outside", + "hooves of horses", + "The horse's neigh resonated loudly", + "A soft snort from a horse broke the calm", + "The rhythmic clopping of hooves was unmistakable", + "A low whinny came from somewhere nearby", + "The horse's breath was audible as it exhaled deeply" + ] + }, + { + "event": "howl", + "phrases": [ + "howls", + "dog howling", + "a dog is howling", + "animal howling loudly", + "howling sound", + "many dogs howling at the same time", + "an animal howls", + "a dog is howling", + "dog howling sound", + "a loud howl", + "wolves howl", + "howling noises", + "A long, mournful howl echoed through the night", + "The howling sound 
was eerie and distant", + "A series of howls broke the otherwise quiet surroundings", + "The plaintive howl carried an air of loneliness", + "Howling noises seemed to come from all directions" + ] + }, + { + "event": "idling", + "phrases": [ + "idling", + "idling sounds", + "idling of engine", + "idling car", + "idling increases", + "idling engine", + "idling noise", + "idling vehicle motor", + "idling at a constant speed", + "an engine that is idling", + "idling of an engine", + "a machine resembling a vehicle makes sounds of an engine idling", + "idling car engine", + "an engine tapping and idling", + "idling vehicle", + "an idling machine", + "its engine idles", + "clicking from an idling engine", + "idling engines", + "The engine idled with a steady hum", + "A low idling sound filled the air", + "The idling engine produced a consistent, rhythmic noise", + "A faint idling sound could be heard in the background", + "The sound of the engine idling was calm and subdued" + ] + }, + { + "event": "jackhammer", + "phrases": [ + "jackhammer", + "jackhammering", + "mono jackhammer", + "a jackhammer pounds repeatedly", + "a jack-hammer is breaking up a concrete slab", + "a jackhammer runs", + "a jackhammer operating", + "a jackhammer in use", + "a jackhammer creates a loud and constant pounding noise", + "a jackhammer in operation", + "a jackhammer runs continously", + "a jackhammer operating then slowing down before operating at a normal rate again", + "someone is using a jackhammer to break concrete", + "the sound of a jackhammer", + "a jackhammer pounds", + "a jack-hammer is being used in a factory", + "workers are using a jackhammer", + "a jackhammer drilling and vibrating continuously", + "a jackhammer", + "a jackhammer operates", + "a jackhammer is loudly breaking concrete", + "a jackhammer operates with mechanical sounds", + "a man uses a jackhammer", + "a jackhammer pounds a hard surface", + "real sound of a jackhammer", + "a jackhammer runs continuously", + 
"machines hammer", + "a jackhammer drilling", + "The jackhammer's rapid pounding echoed loudly", + "A sharp, rhythmic jackhammer sound filled the air", + "The jackhammer noise was relentless and intense", + "The pounding of the jackhammer was unmistakable", + "The sound of the jackhammer carried through the surroundings" + ] + }, + { + "event": "jet_engine", + "phrases": [ + "a jet engine runs", + "aircraft jet engine", + "a jet engine roars", + "humming of a nearby jet engine", + "jet engine sounds", + "jet engine of an aircraft running", + "a jet engine runs and hisses", + "a jet engine hisses", + "jet engine", + "a jet engine whirs loudly", + "jet engine operating", + "a jet engine is operating and accelerating", + "jet engine flying", + "jet engine hums", + "steady jet engine running", + "a jet engine runs steadily", + "whooshing from a jet engine", + "a jet engine works nearby", + "a jet engine whirring sharply", + "a jet engine screams", + "The roar of a jet engine filled the atmosphere", + "A loud jet engine sound dominated the surroundings", + "The jet engine emitted a powerful, continuous noise", + "The sound of the jet engine grew louder as it neared", + "A deep, rumbling jet engine noise was audible" + ] + }, + { + "event": "knock", + "phrases": [ + "knock", + "knocks", + "knocking", + "knocking on door", + "knocking on a door", + "knock sounds", + "knocking on an office door", + "knocking on a window", + "door knocking", + "knocks on a wooden door", + "knocking in a wooden door", + "knocking on wooden door from inside", + "knocking on wood", + "pounding on a door", + "a series of knocks", + "knocking repeats numerous times", + "knocking on wood like a door", + "knocking on a glass surface", + "several knocks on a house door", + "knocking on a hollow wooden surface", + "a knocking on a door", + "a series of knocking", + "knocking repeatedly", + "a knocking door", + "a person knocks on a door", + "a knock", + "a series of door knocking", + "a knock on a 
door", + "knocking sounds", + "a loud knock on a door", + "A short knock echoed through the space", + "Knocking sounds came in a steady rhythm", + "The sharp knock was deliberate and clear", + "A faint knock could be heard from the distance", + "The knocking noise was distinct against the silence" + ] + }, + { + "event": "laughter", + "phrases": [ + "laughter", + "laugh", + "laughing", + "audience laughs", + "wicked laughter", + "laughter continues", + "audience laugh", + "laughs", + "audience laughing", + "short laughter", + "laughter from two or more persons", + "laughter on a good level", + "laughter from the man", + "the audience laughs", + "audience laughter", + "laughters", + "the audience laughing", + "people belly laugh", + "tap dancers laugh", + "a laughter", + "a laughter sample", + "belly laugh", + "a lot of belly laughter", + "canned laughter", + "an audience laughs", + "laughter from crowd", + "short laughter from an audience", + "The sound of laughter rang out joyfully", + "A burst of laughter broke the silence", + "The rhythmic sound of laughter filled the space", + "Laughter echoed, spreading a sense of cheer", + "A soft chuckle turned into hearty laughter" + ] + }, + { + "event": "lawn_mower", + "phrases": [ + "lawn mower", + "lawn mowers", + "lawnmower engine", + "lawn mowers start", + "lawn mowers run", + "the sounds of a lawn mower", + "lawn mowing", + "a lawn mower engine running then powering down", + "a lawn mower shifts gears and accelerates", + "lawn mower operates", + "lawn mower runs", + "a ride-on-lawnmower is being recorded", + "loud noise of turning on and off a lawnmower", + "lawn mower with engine starting", + "tree chipper sound effect", + "a lawnmower starts and stops", + "someone is cutting grass with an electric mower", + "a lawn mower is running with engine starting", + "a lawn mower runs and stops", + "a lawn mower operates", + "someone is cutting grass with a motor mower", + "lawn mower riding past a dog", + "a lawnmower 
putters out", + "the loud onset of a lawn mower engine idling", + "lawn mower sounds", + "lawnmower type engine sound that grows stronger then fades off", + "a lawnmower engine is started multiple times", + "a lawn mower engine running", + "a lawn mower running steadily for some time", + "The steady hum of the lawn mower filled the yard", + "A loud buzzing noise came from the spinning blades of the mower", + "The lawn mower's engine produced a rhythmic droning sound", + "The whirring of the mower grew louder as it passed nearby", + "A persistent mowing sound resonated throughout the area" + ] + }, + { + "event": "liquid", + "phrases": [ + "liquid", + "liquids pour", + "water pour", + "pouring liquids", + "a liquid", + "a liquid pours", + "liquid sounds", + "water", + "liquid is pouring into a pitcher", + "the sound of liquids", + "the sounds of liquid", + "liquids pour and splash", + "liquid pours", + "pouring liquid into container", + "water pouring sounds", + "liquids pouring", + "the sounds of liquid and water", + "spilling water", + "pouring liquid", + "liquid pouring", + "liquids are poured and slosh around", + "liquid filling sounds", + "someone is pouring a glass of water down a sink", + "liquid is trickled and dribbled into water", + "pours liquid", + "liquid is being poured into another liquid", + "water being poured into a pitcher", + "someone is pouring water", + "The liquid splashed softly as it was poured", + "A faint dripping sound signaled the presence of liquid", + "The gurgling of liquid could be heard as it flowed steadily", + "The bubbling sound of liquid indicated gentle motion", + "A sloshing noise came from the container as the liquid shifted" + ] + }, + { + "event": "machine_gun", + "phrases": [ + "machine guns sound", + "machine gun sound", + "sound of a machine gun", + "machine gun sounds", + "machine gunfire sound", + "machine gun fires", + "machine guns fire", + "a short series of machine gunfire", + "machine gunshots sound", + "machine 
gun fire occurs", + "a machine gun is fired in short bursts", + "fires a machine gun", + "machine gun shooting", + "machine gun fires", + "machine guns shot sound", + "machine gunfire rings out", + "machine gun violence", + "loud machine gun sound", + "a light machine gun is firing a single burst", + "light machine gun sound", + "a machine gun is fired", + "machine guns are operating", + "powerful machine gun is being fired", + "machine gun fire rings out", + "Rapid machine gun fire echoed sharply", + "The staccato burst of machine gun bullets was unmistakable", + "A steady rattle of machine gun fire filled the air", + "The loud, repetitive sound of a machine gun dominated the scene", + "The sharp report of a machine gun firing in rapid succession was startling" + ] + }, + { + "event": "male_singing", + "phrases": [ + "a male voice sings", + "a man sings a mantra", + "a man sings a song", + "a male singer sings", + "a male singer sing", + "male voices sing a song", + "a male singer performs", + "a male voice sings multiple times", + "a male sing", + "a male voice sings in a repeating sequence", + "a male sings", + "singing a line", + "singing with a male voice", + "singing by a male", + "an adult male sings", + "singing from a male", + "a male singing voice heard intermittently", + "a man is singing a song", + "a male singer", + "a male singing voice", + "a male singer is performing", + "a man sings continuously throughout the track", + "a young adult male sings", + "male singer performs", + "singing with male vocals", + "a male sings intermittently", + "a male voice chanting", + "a man sings multiple songs", + "a man take turns singing", + "The deep tones of male singing resonated warmly", + "A melody sung by a male voice carried through the air", + "The rich timbre of the male singing voice was captivating", + "The male voice sang with a clear and resonant tone", + "A harmonious male vocal performance filled the space" + ] + }, + { + "event": "mechanical_bell", + 
"phrases": [ + "a mechanical bell ringer", + "a mechanical bell", + "a mechanical bell rings", + "a mechanical bell is ringing", + "mechanical bells", + "mechanical bell sound", + "The mechanical bell rang with a clear, metallic tone", + "A steady ringing sound came from the mechanical bell", + "The distinct chime of the mechanical bell was heard", + "The rhythmic clanging of the mechanical bell filled the air", + "A sharp, repeated dinging sound indicated the mechanical bell" + ] + }, + { + "event": "mechanical_fan", + "phrases": [ + "a mechanical fan blowing", + "a mechanical fan", + "mechanical fan", + "fan", + "a mechanical fan runs", + "mechanical fan noise", + "fan sounds", + "The mechanical fan emitted a consistent whirring noise", + "A soft humming sound came from the rotating blades of the fan", + "The fan's motor produced a steady droning sound", + "The rhythmic whooshing of air from the fan was audible", + "A faint mechanical hum indicated the operation of the fan" + ] + }, + { + "event": "medium_engine_(mid_frequency)", + "phrases": [ + "medium engine sound", + "mid-frequency medium engine", + "noise coming from a medium engine", + "a medium engine runs", + "a medium engine revving", + "quick revving of a medium engine", + "a medium engine hums", + "a medium engine accelerating", + "a medium engine making revving sounds", + "a medium engine roars", + "a medium engine revs and squeals", + "a medium engine sound", + "a medium engine revs and accelerates", + "medium engine noise", + "a medium engine makes noise", + "a medium engine is making mid frequency sounds", + "a medium engine sounds", + "a medium engine revs", + "the sound of a medium engine", + "The engine emitted a steady mid-frequency hum", + "A moderate droning sound came from the engine at idle", + "The medium engine produced a deep, consistent noise", + "A rhythmic purring noise was heard from the engine", + "The engine's sound was neither too high nor too low in pitch" + ] + }, + { + "event": 
"meow", + "phrases": [ + "meow", + "meows", + "cat meow", + "a loud meow", + "meowing sound", + "meowing", + "a meow", + "a series of meows", + "a cat meows three times", + "a series of meowing", + "cat meowing 3 times", + "a pet cat meows two times", + "cat sounds", + "a cat meow sound", + "three short meows", + "the meow of a cat", + "a cat makes an anguished meow", + "a cat meows sharply", + "kitten meowing", + "a cat meows alternately", + "a cat meows loudly two times", + "a car meows", + "a cat continuously meows", + "cat meowing", + "a cat meows angrily", + "A cat's meow echoed softly", + "The meowing sound was tonal and clear", + "A plaintive meow was heard nearby", + "The cat's meow was short and distinct", + "A series of soft meows indicated the presence of a cat" + ] + }, + { + "event": "mosquito", + "phrases": [ + "mosquito", + "mosquito is buzzing close up", + "a mosquito sound created with a synth", + "a mosquito buzzing", + "an insect buzzing at a high pitch tone continuously", + "a mosquito buzz", + "a housefly buzzes by loudly nearby", + "a mosquito buzzes", + "the humming of a mosquito", + "random buzzing of an insect varying in loudness", + "buzzing mosquito flying", + "the sound of a mosquito", + "mosquitos", + "a mosquito", + "a mosquito buzzes in the background", + "mosquitoes buzz in the foreground", + "a mosquito buzzing in the background", + "the buzzing of a flying mosquito", + "a mosquito flying", + "a mosquito buzzes nearby", + "an mosquito buzzes around continuously", + "a mosquito chirping in the background", + "a synthetic sound of buzzing mosquitos or bees", + "a mosquito sound", + "The high-pitched whine of a mosquito was audible", + "A faint buzzing noise indicated the presence of a mosquito", + "The mosquito's sound was sharp and persistent", + "A buzzing sound hovered nearby, characteristic of a mosquito", + "The mosquito's whine grew louder as it approached" + ] + }, + { + "event": "motorboat", + "phrases": [ + "motorboat", + 
"speedboat", + "motorboats", + "motorboat motor", + "the sounds of a motorboat", + "electronic motorboat engine", + "motorboat engine", + "a motorboat engine is running loud and fast", + "the boat motor advances", + "a motorboat", + "a motorboat engine runs continuously", + "the sound of a motorboat", + "motorboats rev up rapidly", + "a motorboat engine reduces to a slower and quieter pace", + "a motorboat engine revving continuously", + "engine boat revving", + "a motorboat's sounds", + "a speedboat engine running on and off", + "boat motor", + "speedboat engine run loudly", + "a boat motor is running with increasing frequency", + "motorboat engine acceleration", + "a motor revs loudly and then decreases", + "a motorboat speeds up even more", + "a motorboat engine vibrates loudly nearby", + "The motorboat's engine produced a steady droning noise", + "A rhythmic chugging sound came from the motorboat", + "The motorboat's engine roared as it accelerated", + "A deep rumbling noise was heard from the motorboat", + "The motorboat's sound was distinct and mechanical" + ] + }, + { + "event": "motorcycle", + "phrases": [ + "motorcycle", + "motorcycle engine", + "motorcycles rev", + "motorcycle revving", + "motorcycle vehicle revving down", + "motorcycle engine revving up", + "motorcycle reving loudly", + "motorcycle revving engine", + "motorcycle engine revving", + "a motorcycle revs up", + "motorcycle pass", + "motorcycle revs quickly", + "a motorcycle engine accelerates quickly", + "motorcycle engines accelerate", + "motorcycles acceleration", + "a motorcycle engine revs up", + "motorcycles accelerate", + "revving sounds of a motorcycle", + "a motorcycle quickly accelerates", + "motorcycle engine accelerates", + "a motorcycle engine revving sharply", + "motorcycles", + "a motorcycle revs up loudly", + "motorcycle engines rev", + "revving motorcycles", + "loud accelerating motorcycle", + "motor bike engine revving", + "motor revving", + "a motorcycle engine decelerates", 
+ "a motorcycle engine roars", + "The motorcycle's engine emitted a sharp roaring noise", + "A steady revving sound came from the motorcycle", + "The motorcycle's engine produced a mid-frequency drone", + "A rhythmic rumble indicated the presence of a motorcycle", + "The motorcycle's sound grew louder as it passed by" + ] + }, + { + "event": "music", + "phrases": [ + "music", + "playing music", + "the music", + "a variety of music", + "a mix of music", + "the playing of music", + "background of music", + "a music playing", + "a background of music", + "a music soundtrack", + "a music", + "a group of people listen to music", + "a music track plays", + "piece of music", + "a mixture of music", + "a piece of music", + "a piece of music plays", + "music playing", + "the sound of music", + "a music played", + "a musical track", + "music in play", + "music in the background", + "A melodic tune resonated beautifully", + "The rhythmic sound of music filled the air", + "Harmonious notes blended seamlessly in the music", + "The music's sound was soothing and pleasant", + "An instrumental melody played softly in the background" + ] + }, + { + "event": "neigh", + "phrases": [ + "neigh", + "a horse neighs", + "horses neigh", + "horse neighing", + "a horse neighs nearby", + "a horse neighs loudly", + "a horse neighs wildly", + "horses are neighing", + "a horse letting out a neigh", + "a neigh", + "horse neighing sound", + "A clear neigh echoed through the surroundings", + "The horse's high-pitched neigh broke the silence", + "A series of short neighs was heard nearby", + "The neighing sound carried a sense of urgency", + "A loud neigh resonated across the field" + ] + }, + { + "event": "ocean", + "phrases": [ + "ocean", + "surf waves", + "ocean waves", + "the ocean", + "seawash", + "surf", + "waves (surf)", + "ocean waves ebb and flow", + "the waves", + "waves", + "an ocean", + "the sounds of the ocean", + "the sounds of surf", + "ocean currents", + "ocean waves repeatedly 
crash", + "ocean waves are moving at a moderate pace", + "ocean waves break", + "waves break", + "ocean waves are breaking and crashing onto shore", + "the sounds of waves", + "ocean waves are repeatedly splashing on shore", + "the ocean waves", + "waves crashing onto shore continuously", + "the ocean waves are hitting the shore at a moderate pace", + "waves are continuously washing onto shore", + "waves crashing continuously onto shore", + "the sound of the ocean waves", + "close ocean waves", + "ocean ambience", + "The soothing sound of ocean waves was constant", + "A rhythmic crashing of waves echoed along the shore", + "The gentle lapping of water indicated a calm ocean", + "A deep, rolling wave sound was heard from the ocean", + "The ocean's sound was vast and ever-present" + ] + }, + { + "event": "oink", + "phrases": [ + "oink", + "oinking", + "an oink", + "oinks", + "animal sounds such as oink, oink", + "brief oinking", + "animal oinking", + "a pig oinks", + "pig oinking", + "a pig making oink", + "animal oink sounds", + "rapid oinking", + "oinking pig", + "rhythmic oinking", + "a pig oinks", + "small oinks from a pig", + "a pig oinking", + "a pig oink sound", + "a pig oink", + "an animal oinks", + "a pig makes oink sounds", + "a pig is making an oinking sound", + "A loud oink broke the silence", + "The characteristic oinking of a pig was heard", + "A rhythmic series of oinks came from nearby", + "The deep oink carried a sense of contentment", + "A soft, short oink was heard intermittently" + ] + }, + { + "event": "owl", + "phrases": [ + "owl hooting", + "owl sound", + "an owl hooting", + "an owl sound effect", + "an owl is being recorded", + "a hoot owl making a sound", + "an owl hoots", + "an owl vocalizes", + "an owl whistles", + "a whistling owl calls out", + "an owl is making hooting sounds", + "an owl sound effect", + "A soft hooting sound came from an owl", + "The owl's call echoed eerily in the night", + "A rhythmic hoot was heard repeatedly", + "A 
low, resonant hoot indicated an owl nearby", + "The owl's sound was distinct against the silence" + ] + }, + { + "event": "paper_rustling", + "phrases": [ + "papers rustling", + "paper rustling sounds", + "sounds of paper rustling", + "rustling sound", + "rustle paper", + "paper sounds", + "paper rustling and crumpling", + "paper rustle", + "The soft rustling of paper was audible", + "A faint crinkling noise came from handling the paper", + "The sound of paper rustling was brief but clear", + "A gentle rustle indicated someone turning pages", + "The crisp sound of paper being folded could be heard" + ] + }, + { + "event": "pig", + "phrases": [ + "pig oinks", + "pigs oink", + "pigs oinks", + "pig oinking", + "pigs", + "pig grunts", + "pigs grunt", + "pigs oinking", + "pigs crow", + "a pig sound", + "pigs intermittently oinking", + "pigs squeal", + "A loud grunt came from the pig", + "The pig's snorting was steady and rhythmic", + "A series of low grunting noises indicated the pig's presence", + "The pig's vocalizations were deep and guttural", + "A soft, contented snuffle was heard from the pig" + ] + }, + { + "event": "plop", + "phrases": [ + "plop", + "plop plop sound", + "plops", + "plopping sounds", + "a plop noise", + "a plopping sound effect", + "a single plop", + "A small plop was heard as something dropped into water", + "The plopping sound was soft and quick", + "A faint plop echoed briefly in the stillness", + "The sound of a plop indicated a small object entering liquid", + "A single plop broke the silence momentarily" + ] + }, + { + "event": "police_car_(siren)", + "phrases": [ + "a police car siren blares, then stops, and afterward blares again", + "a police siren wails once before fading in the distance", + "an police car siren", + "police car sirens ring in rapid succession", + "an police car blares its siren", + "a police car siren goes off and continues awhile", + "a police car siren sounds and then stops and restarts again", + "police car sirens 
blaring in succession", + "police car sirens blare in a series", + "a couple of police car sirens blaring one after the other", + "an police cariren wails in different patterns", + "police car sirens ring as they approach", + "a police siren rings in different patterns", + "continuous police car siren becoming rapid", + "a police car siren sounds", + "an police car siren sounding off continuously", + "an police vehicle siren wails and echoes", + "a police car siren sounds in different patterns continuously", + "The wailing siren of a police car grew louder", + "A sharp, oscillating siren sound filled the air", + "The police car's siren echoed through the streets", + "A high-pitched siren noise was unmistakable", + "The sound of the police siren faded into the distance" + ] + }, + { + "event": "power_saw", + "phrases": [ + "circular saw", + "power saw", + "power saws", + "table saw", + "circular saw is being recorded", + "a circular saw in operation", + "a circular saw is being used", + "a circular saw runs", + "electric saw cutting", + "a power saw running", + "a power saw cutting some objects", + "a power saw makes a cutting sound", + "sound of table saw", + "a power saw cuts an object", + "a loud power sawing", + "a power saw turns on and runs", + "a power saw runs", + "power saw sounds", + "mechanical saw sawing", + "sound of a power saw", + "a power saw cutting", + "a man uses a power saw", + "large power sawing", + "power tools saw", + "a power saw cuts", + "The sharp buzzing of a power saw was heard cutting through material", + "A high-pitched whirring noise came from the power saw", + "The power saw's motor emitted a steady droning sound", + "The power saw produced a harsh grinding noise as it worked", + "A rhythmic sawing sound indicated continuous operation of the power saw" + ] + }, + { + "event": "power_tool", + "phrases": [ + "power tool sound", + "a soft power tool drilling", + "power tools run", + "a man is using a power tool", + "a power tool making 
drilling noises", + "vibrations from a power tool", + "a power tool running", + "ringing of a power tool", + "the sound of a power tool spinning", + "a power tool runs continuously", + "power tools drill through materials", + "power tools make noise", + "power tools buzz", + "power tool sounds", + "power tools are at work", + "power tools are being used", + "a power tool buzzes", + "power tools cut", + "The power tool emitted a loud, mechanical buzzing sound", + "A steady whir came from the power tool as it operated", + "The motorized tool produced a rhythmic grinding noise", + "A high-pitched hum indicated the use of the power tool", + "The sound of the power tool was sharp and mechanical" + ] + }, + { + "event": "printer", + "phrases": [ + "printer", + "printer is printing a document", + "printer is printing out a receipt", + "printers", + "printer is printing a piece of paper", + "printers print continuously", + "a printer prints", + "printer turning on", + "a printer is printing a receipt", + "an operating printer", + "a home printer", + "a printer printing", + "a desktop printer operating", + "printer hum", + "a printer operates", + "printers are printing with mechanisms sounds", + "a printer is turning on", + "a printer", + "a printer is printing out a receipt", + "a 3d printer is printing", + "a printer runs", + "a printer works", + "a printer mechanism", + "printer noise", + "gears operating on a printer", + "a 3d printer is in action", + "printer mechanisms", + "the sounds of a printer", + "a printer is scanning a book", + "a printer being turned on", + "The printer emitted a rhythmic whirring and clicking sound", + "A steady hum accompanied the operation of the printer", + "The printer's motor produced a faint, mechanical droning noise", + "A sequence of beeps and printing noises indicated activity", + "The sound of paper feeding added to the printer's operation noise" + ] + }, + { + "event": "propeller", + "phrases": [ + "propeller", + "propellers", + 
"propeller noise", + "propeller sounds", + "propellers spin", + "prop engine sound", + "a propeller sound", + "propellers twirling", + "air propellers", + "propeller create wind sounds", + "propellers rotating", + "propeller make sounds", + "propeller sounds get loud as it comes close", + "propellers air and buzz", + "drone propellers", + "propeller blades", + "propellers are spinning", + "propeller airscrew sounds", + "a propeller is running", + "The propeller's rhythmic chopping sound filled the air", + "A steady hum came from the spinning propeller", + "The propeller noise grew louder as the blades spun faster", + "The sound of the propeller was deep and mechanical", + "A distinct whirring noise indicated the operation of the propeller" + ] + }, + { + "event": "quack", + "phrases": [ + "quack", + "a duck quacks", + "artificial duck quacks", + "the sound of a duck quacking", + "the sound of a quacking duck", + "a duck quacks rhythmically", + "quacking", + "a duck quacks in rapid succession", + "a duck quacks loudly nearby multiple times", + "duck quacks", + "duck quaking loudly", + "quacks", + "a duck quacks loudly and continuously", + "duck quack", + "a duck quacking repeatedly without breaks", + "a single quack", + "a duck quacks first moderately and then vigorously", + "duck quaking", + "quacking duck", + "a duck repeatedly quacks loudly", + "a duck quacks continuously", + "duck quacking loudly", + "a duck quacking several times", + "duck quacking", + "a mother duck quacks", + "a duck quacks rapidly", + "a duck quacks many times", + "a duck quacks multiple times", + "loud and rapid quacking", + "a duck quacking continuously at consistent intervals", + "A loud quack broke the silence", + "The characteristic quacking noise of a duck was heard nearby", + "A rhythmic series of quacks echoed across the water", + "The duck's quack sounded sharp and distinct", + "A soft quack was heard intermittently" + ] + }, + { + "event": "race_car", + "phrases": [ + "race car 
engines", + "race car running", + "race car noise", + "race car", + "race car speeding off", + "race car sounds", + "car engine sound in car race", + "a race car runs", + "sounds of a car race", + "race car engine revs", + "auto racing", + "auto racing sound", + "a race car loudly accelerate outside", + "auto racing running", + "a race car accelerates loudly", + "an auto racing passing by", + "The roar of a race car engine echoed loudly", + "A sharp revving sound came from the race car", + "The race car's engine produced a deep, powerful noise", + "A high-pitched whine accompanied the acceleration of the race car", + "The sound of the race car was intense and relentless" + ] + }, + { + "event": "rain", + "phrases": [ + "rain", + "raining", + "rainfall", + "rain falling", + "rain fall", + "rain falls", + "rain on surface", + "rain falls steadily", + "rain falling onto a hard surface", + "raining hard", + "rain falls on surface", + "rain steadily falls", + "rain falling heavily", + "rain is falling and spattering on a surface", + "rain falling heavily on a surface", + "rain is falling hard on a tile floor", + "rain falls onto a surface", + "rain falls onto a street", + "rain falls on a surface very thickly nearby", + "rain falling on the surface", + "rain falls onto a hard surface heavily", + "rain falling and dropping on a surface", + "rain fall heavily", + "rain on surfaces", + "rain falls steadily onto a hard surface", + "rain falls loudly and rapidly on a surface", + "rain falling hard", + "rain falling on a surface", + "rain fall onto a hard surface", + "rain is falling and pattering on a hard surface", + "The gentle patter of rain was soothing to hear", + "A steady rain sound filled the environment", + "The rhythmic dripping of rain was audible", + "A soft rainfall sound created a calming atmosphere", + "The rain's sound was consistent and natural" + ] + }, + { + "event": "rain_on_surface", + "phrases": [ + "pitter-patter of rain", + "rain on surface", + 
"rain", + "rain is falling hard on a tile floor", + "rain falls heavily onto a hard surface", + "rain is falling and spattering on a surface", + "rain falls onto a hard surface heavily", + "rain falling heavily on a surface", + "rain falling onto a hard surface", + "rain is falling and pattering on a hard surface", + "rain falls heavily on a surface", + "rain fall onto a hard surface", + "rain falls down loudly on a surface", + "rain falling on a hard surface", + "a strong rainfall on a hard surface", + "rain pours heavily on a surface", + "rain falls rapidly on a surface", + "rain falls down rapidly", + "rain fall", + "rain falls on a surface rapidly nearby", + "rain taps", + "rain is falling on a surface hard", + "rain is falling and hitting surfaces", + "raindrops pitter-patter", + "rain falls on a surface very thickly nearby", + "rain is falling hard on a surface", + "rain is falling very very hard onto a surface", + "rain falls very loudly on a surface", + "rain on surfaces", + "raining hard", + "The steady sound of rain striking a roof was constant", + "A rhythmic tapping noise came from rain hitting a window", + "The rain on the surface created a soft splattering sound", + "A persistent pattering of rain was heard on the ground", + "The sound of rain on the surface was soothing and steady" + ] + }, + { + "event": "rattle", + "phrases": [ + "rattle", + "rattle sounds", + "rattles shake", + "a shake", + "a fast rattle", + "a rattle sounds", + "vibrations rattle", + "a rattle shakes", + "rattle sounds are heard intermittently", + "a rattle noise", + "A rapid rattling noise came from loose objects", + "The sound of small items rattling was sharp and consistent", + "A faint rattle echoed from within a container", + "The rattling noise was abrupt and repetitive", + "A series of clattering sounds indicated movement" + ] + }, + { + "event": "reversing_beeps", + "phrases": [ + "reversing beeps are heard", + "reversing beeps sound", + "reversing beeps are processed", 
+ "reversing beeps", + "reversing beeps occur in a short series", + "reversing beep sounds", + "sound of reversing beeps", + "reversing beeps in a mechanical setting", + "beeping sounds repeat", + "a reversing beeps loudly nearby several times", + "beeping inside a room", + "The reversing beep of a vehicle was steady and rhythmic", + "A sharp beeping sound indicated a vehicle in reverse", + "The warning beeps grew louder as the vehicle reversed", + "A repetitive beep alerted pedestrians to a reversing vehicle", + "The sound of reversing beeps was mechanical and consistent" + ] + }, + { + "event": "ringing_tone", + "phrases": [ + "ringing sound", + "ringing tones", + "bell tone", + "ringtone sound", + "ringing sound", + "ringback tone", + "ringing reverb", + "bell sound", + "boing sound", + "A clear ringing tone sounded from a nearby phone", + "The phone's ringing tone was sharp and attention-grabbing", + "A steady beeping noise indicated an incoming call", + "The sound of a ringing tone echoed in the room", + "The synthesized ringing sound was electronic and distinct" + ] + }, + { + "event": "sanding", + "phrases": [ + "sanding", + "sanding a wooden surface", + "sanding wood", + "sanding a solid object", + "sanding on wood", + "sanding and rubbing", + "brushing", + "dry sanding", + "wood sanding", + "sanding and filing", + "someone is sanding", + "rub sanding", + "someone is sanding a piece of wood", + "a series of sanding", + "a person sands an object", + "wood brushing", + "the sounds of sanding", + "sanding of wood", + "wood being sanded", + "some sanding", + "The sound of sanding was rough and consistent", + "A rhythmic scraping noise came from the sanding process", + "The sanding sound grew softer as the surface smoothed out", + "The abrasive sound of sanding was sharp and repetitive", + "A faint scratching noise was heard during sanding" + ] + }, + { + "event": "sawing", + "phrases": [ + "sawing", + "sawing repeats", + "sawing wood", + "sawing a plastic 
surface", + "a saw sawing", + "sawing noises", + "sawing a solid object", + "sawing of wood", + "sawing occurs", + "a person is sawing a solid object", + "a saw sawing wood", + "sawing and wood sounds", + "industrial saw sawing wood", + "wooden sawing", + "a saw is used on wood", + "sawing something", + "a solid object is sawed", + "metal sawing wood", + "a person is sawing an object", + "sawing of wood products", + "sawing of wood with a hand saw", + "a man uses a saw to cut a solid object", + "sawing of wood is occurring", + "rhythmic metal sawing", + "a tool sawing wood", + "wood sawing", + "a saw cutting a solid object", + "a saw cutting an object", + "sawing a bamboo stick", + "a person saws an object", + "The sawing sound was rhythmic and sharp", + "A steady rasping noise came from the saw cutting through material", + "The sound of sawing grew louder as the blade moved faster", + "A high-pitched noise indicated a motorized saw in use", + "The sawing sound was rough and mechanical" + ] + }, + { + "event": "scrape", + "phrases": [ + "scrape", + "scratch", + "a scraping", + "the sounds of scraping", + "scraping with surface contact", + "a scrape sound", + "a rub", + "rough scraping", + "a scrape", + "A sharp scraping sound was heard as the surface was scratched", + "The sound of scraping was harsh and repetitive", + "A faint scraping noise came from a distant source", + "The scraping sound grew louder as the edge moved across the surface", + "The noise of scraping was abrasive and mechanical" + ] + }, + { + "event": "screaming", + "phrases": [ + "screaming", + "screams", + "wild screaming", + "painful screams", + "screams loudly", + "a person screaming in terror", + "a constant screaming", + "a kid screaming ", + "people are screaming", + "a group of people are screaming wildly", + "people scream in fear", + "she screams", + "someone is desperately screaming", + "a loud screaming", + "human screaming", + "a woman is screaming in terror", + "woman screaming", + 
"the adult female screams", + "males scream", + "people screaming", + "young child is sustained screaming", + "women screaming", + "human screams", + "a girl screaming in a soundstage", + "a baby screaming in the foreground", + "an adult male screams", + "screaming babies", + "females scream", + "male screaming", + "A loud scream pierced the air", + "The sound of screaming was sharp and high-pitched", + "A series of screams echoed in the distance", + "The scream was sudden and startling", + "A prolonged scream carried a sense of urgency" + ] + }, + { + "event": "sewing_machine", + "phrases": [ + "sewing machines", + "sewing machine", + "sewing machine stitches", + "sewing machine stitching", + "sewing machine operating", + "sewing machine running", + "a sewing machine returns to rapid sewing", + "a sewing machine works at regular intervals", + "a woman taps and clicks with a sewing machine", + "sewing machine being used", + "sewing machine working", + "a sewing machine clinks repetitively before stopping", + "a sewing machine being used", + "sewing machine running", + "sewing machine mechanisms", + "a sewing machine operates several times", + "sewing machine going", + "sewing machines operate intermittently", + "sewing machine runs", + "a sewing machine is used with sewing sounds", + "the sounds of a sewing machine", + "a sewing machine is being use", + "sewing machine clacking", + "a sewing machine operates", + "a sewing machine is making sound", + "short bursts of sewing", + "The sewing machine emitted a steady whirring noise", + "A rhythmic clicking sound came from the sewing machine", + "The motor of the sewing machine produced a faint hum", + "The sound of the sewing machine was quick and mechanical", + "A soft clattering noise indicated the machine was in use" + ] + }, + { + "event": "sheep", + "phrases": [ + "sheep", + "sheep bleat", + "a young sheep bleats", + "sheep baah", + "sheep baa", + "a sheep bleats multiple times", + "a number of sheep bleating 
continuously", + "sheep baaing", + "several sheep bleating unceasingly", + "older sheep bleating", + "a sheep bleats nearby multiple times", + "young sheep baa", + "the sounds of sheep", + "a sheep bleets", + "sheeps bleat", + "sheep bleats", + "a sheep bleats", + "a sheep goes baa", + "sheep grunt", + "the sheep bleat", + "a sheep beys", + "sheep scream", + "a sheep bleats a couple of times", + "a sheep sporadically bleating", + "sheep bleating sporadically", + "several sheep bah", + "a sheep baas", + "A loud bleat came from a sheep", + "The sheep's bleating was rhythmic and persistent", + "A series of low bleating noises indicated a flock nearby", + "The sound of the sheep was soft and distinct", + "A faint bleat echoed in the distance" + ] + }, + { + "event": "shout", + "phrases": [ + "shout", + "people shout", + "shouts", + "crowd shouting", + "a crowd of people shouting", + "several people shout", + "yelling", + "shouting", + "a man briefly shouting", + "crowd yelling", + "yells", + "shout sounds", + "a guy shouts", + "a man yelling", + "children are yelling in unison", + "people are shouting", + "the sounds of shouting", + "a man yelling", + "someone shouting out", + "a group of people erupt with shouts", + "someone is yelling", + "a loud scream", + "the male screams", + "A loud shout carried over the noise", + "The sound of shouting was clear and deliberate", + "A sharp shout broke the silence", + "The shouting noise was steady and commanding", + "A distant shout could be faintly heard" + ] + }, + { + "event": "shower", + "phrases": [ + "shower", + "shower water", + "showering", + "bathtub sounds", + "shower sounds", + "water tap sound", + "shower running", + "showers", + "water runs into a shower", + "a shower", + "face washing", + "a shower is running loudly", + "shower is running inside a bathroom", + "someone is turning a shower on", + "bathroom shower is spraying water", + "someone is using a shower", + "The steady sound of water spraying from the 
shower was soothing", + "A rhythmic splashing noise came from the running shower", + "The sound of water hitting the shower floor was distinct", + "A soft hissing noise indicated the spray of the shower", + "The shower's noise was consistent and calming" + ] + }, + { + "event": "shuffling_cards", + "phrases": [ + "playing cards are being riffled and modified", + "people are shuffling cards inside a small room", + "shuffling cards sounds", + "playing cards are being ruffled through", + "cards shuffling", + "shuffling cards", + "cards are being shuffled", + "someone shuffles a deck of cards", + "cards shuffling on a surface", + "cards are shuffling on a hard surface", + "card shuffling sounds", + "someone is shuffling a deck of cards", + "someone is shuffling playing cards", + "shuffling card sounds", + "cards are being riffle shuffled", + "paper shuffles", + "people shuffle cards in a small room", + "cards are shuffled repeatedly", + "cards are being shuffled with the riffle shuffle method", + "people shuffle cards", + "a deck of playing cards is being shuffled", + "The sound of shuffling cards was quick and rhythmic", + "A soft rustling noise came from the deck being shuffled", + "The cards made a faint clicking sound as they were shuffled", + "A rhythmic series of card noises indicated shuffling", + "The shuffling sound was soft but distinct" + ] + }, + { + "event": "sigh", + "phrases": [ + "a heavy sigh", + "an emotional sigh", + "sighing", + "a long sigh", + "a human sigh sound", + "sighing sounds", + "a sigh in the background", + "a sigh", + "a young male sighs", + "sighs", + "a female sigh", + "a person sighing", + "A soft sigh was audible, indicating relief or exhaustion", + "The sound of a sigh broke the silence momentarily", + "A gentle exhalation was heard, resembling a sigh", + "The sigh was deep and carried a sense of weariness", + "A faint, audible sigh signaled contemplation or relief" + ] + }, + { + "event": "sink_(filling_or_washing)", + "phrases": [ 
+ "running tap in wash basin", + "water pouring", + "filling a sink", + "a faucet pours water", + "water flows hard from a faucet into a tub", + "tap water", + "water faucet running water", + "water from the sink and faucet", + "sink filling", + "a water tap runs into a hollow surface", + "kitchen water faucet starting and stopping", + "water flows from a sink and faucet", + "water gushes and fills a sink", + "a water tap is turned on and fills a sink", + "water flowing hard from a faucet in short bursts", + "washing", + "water fills and runs in a sink", + "water is running from a faucet into a sink", + "running water in sink", + "water flowing from a faucet at different intervals", + "water flows from a tap into a bathtub", + "The sound of water splashing into the sink was steady", + "A rhythmic dripping noise came from the filling sink", + "The sound of dishes being washed in the sink was distinct", + "A soft gurgling noise indicated the sink draining water", + "The continuous sound of running water filled the sink" + ] + }, + { + "event": "siren", + "phrases": [ + "siren", + "emergency siren", + "emergency vehicle siren", + "siren sound", + "a siren sounds on an emergency vehicle", + "fire emergency vehicle siren", + "a remix of a siren is playing and looping", + "its siren", + "an emergency siren wails loudly", + "sirens wail in quick succession", + "a siren wails continuously", + "siren from emergency vehicle", + "a siren blaring continuously", + "ambulance siren wail", + "a siren wails loudly continuously", + "emergency vehicle sirens blare", + "emergency vehicle siren blaring", + "an emergency vehicle siren wails continuously", + "a fire engine sounds its siren", + "an emergency siren goes off loudly", + "emergency vehicle siren", + "fast siren", + "emergency vehicle siren blasts", + "police emergency vehicle siren", + "sirens", + "old crank emergency siren", + "sire wails", + "a siren ringing of a passing emergency vehicle", + "the siren of an emergency 
vehicle sounds and fades away", + "A loud siren wailed in the distance", + "The siren's pitch rose and fell rhythmically", + "A sharp, piercing siren sound filled the air", + "The sound of the siren was unmistakable and urgent", + "A continuous wailing noise indicated an emergency" + ] + }, + { + "event": "sizzle", + "phrases": [ + "sizzle", + "a quick sizzle", + "food sizzle", + "sizzling", + "sizzles", + "liquids sizzle", + "sizzle of food", + "grease sizzles", + "oil sizzle", + "fizzing", + "sizzle of frying food", + "more sizzle", + "objects sizzle", + "sizzling pan", + "soft sizzle", + "a continuous sizzle", + "a sizzle overhead", + "food sizzle while frying", + "a sizzle of frying food at the end", + "good sizzles", + "food sizzles in cookware", + "fodd sizzles in a pan", + "a frying pan sizzles", + "a sizzling sound with multiple layers", + "a sizzle sound", + "carbonated fizz", + "chicken is frying in a pan with a sizzle sound", + "food sizzles on a grill", + "a pan sizzles", + "a continuous sizzle of frying", + "The sizzle of fat cooking was sharp and constant", + "A rhythmic sizzling noise came from the heated pan", + "The sound of sizzling bubbles was audible and distinct", + "A soft sizzling sound indicated food being fried", + "The sizzle grew louder as the oil heated up" + ] + }, + { + "event": "slam", + "phrases": [ + "slam", + "slamming", + "a door slam sound effect", + "a slam", + "a loud smash", + "a loud slam", + "a bang", + "a glass and steel door slams", + "the sounds of slamming", + "a slamming thud", + "slamming a door", + "a simulation of a hit", + "a cupboard slams", + "a bang of a door closing", + "a smash", + "a metallic slam", + "slams", + "a slamming and sound effect noise", + "a loud metal object slamming shut", + "thud", + "a deep thud", + "a big slam", + "a large bang", + "door bang", + "door slamming", + "heavy impact", + "heavy metal door closing", + "A loud slam echoed through the room as the door shut", + "The sharp sound of a 
slammed door broke the silence", + "The slam was sudden and forceful, resonating loudly", + "A heavy slam indicated something closed violently", + "The sound of the slam was abrupt and startling" + ] + }, + { + "event": "slap", + "phrases": [ + "slap", + "slapping", + "slaps", + "smack", + "a slap sound", + "a wet slap", + "smack sound", + "hands slapping", + "slap and smack sounds", + "a slap", + "A sharp slapping sound was heard clearly", + "The slap was loud and sudden, breaking the quiet", + "A quick slapping noise indicated contact between two surfaces", + "The slap sound resonated briefly before fading away", + "A distinct slap noise was audible from nearby" + ] + }, + { + "event": "smoke_detector", + "phrases": [ + "a sharp smoke detector beep sounds continuously", + "smoke detector beep sound", + "smoke detector sound", + "a smoke detector is beeping", + "a smoke detector alarm", + "a smoke detector is making beeping noises", + "a smoke detector is ringing", + "a smoke detector is ticking", + "smoke detectors beep", + "a smoke detector beeps", + "a smoke detector goes off", + "The sharp beeping of a smoke detector was repetitive", + "A loud alarm sounded from the smoke detector", + "The smoke detector emitted a high-pitched, urgent beep", + "A consistent beeping noise indicated a smoke warning", + "The sound of the smoke detector was unmistakable and alarming" + ] + }, + { + "event": "sneeze", + "phrases": [ + "sneeze", + "a loud sneeze", + "a young person sneezes", + "loud male sneeze", + "sneezes", + "sneeze sound", + "a short sneeze", + "men sneeze", + "a woman sneezes a deep sneeze", + "male sneezes", + "a person sneezes", + "a large sneeze", + "an adult female sneezes", + "a series of sneezes", + "multiple sneezes", + "an adult female sneezes once", + "man sneezes", + "a single loud sneeze", + "a sneezing", + "sneezing", + "a high pitched sneeze", + "A loud sneeze broke the silence", + "The sound of a sneeze was sudden and forceful", + "A quick, sharp 
sneeze was heard nearby", + "The sneeze was abrupt and followed by a soft exhalation", + "A muffled sneeze indicated an attempt to suppress it" + ] + }, + { + "event": "snoring", + "phrases": [ + "snoring", + "continuous, light snoring", + "snoring man sleeping", + "a series of snoring", + "snoring over and over", + "deep coarse snoring", + "snoring continuously", + "snoring sound", + "snoring from a person", + "deep, rough continuous snoring", + "low pitched snoring", + "low snoring", + "snoring intermittently", + "deep, loud snoring", + "a series of snoring sounds", + "a series of snores", + "continuous, loud snoring", + "continuous repetitive snoring", + "deep and low snoring", + "snoring sounds one after another", + "soft, rapid snoring", + "repetitive, loud snoring", + "snoring repeating several times", + "a person continuously snores in and out", + "a sleeping person is snoring rhythmically", + "snoring occurs in a rhythmic pattern", + "low, slow, soft snoring", + "loud, consistent snoring", + "rhythmical snoring nearby", + "a sleeping person emits a gravely snore", + "The soft rumble of snoring was consistent and rhythmic", + "A loud, intermittent snore echoed in the room", + "The sound of snoring was deep and guttural", + "A faint snore was heard in the background", + "The rhythmic snoring noises grew louder as the person slept deeply" + ] + }, + { + "event": "speech", + "phrases": [ + "speech", + "person making a speech", + "person giving a talk", + "make giving speech", + "make speech", + "a person is giving a speech", + "an speech", + "a person making a speech", + "a person's speech", + "a speaker is giving a speech", + "a people give a speech", + "a human speech", + "a young man gives a speech", + "a person having a speech", + "a speech is delivered", + "a speech", + "a person giving a speech", + "a young man giving a speech", + "young man delivering a speech", + "a person gives a speech", + "human speeches", + "human speech", + "a man gives a public 
speech", + "a man makes a speech", + "a man giving a public speech", + "speech of monologue", + "a person delivering a speech", + "a person speaking a language", + "speeches", + "A steady flow of speech was heard in a conversational tone", + "The speech was clear and articulate", + "A rhythmic cadence in the speech made it engaging", + "The sound of speech was lively and animated", + "A faint murmur of speech could be heard from afar" + ] + }, + { + "event": "spray", + "phrases": [ + "spray", + "sprays", + "spraying", + "spraying", + "short spray", + "spray sound", + "spray painting", + "spray intermittently", + "liquid is sprayed", + "spraying paint", + "a small spray", + "someone is spraying", + "liquid sprays", + "spraying liquid", + "compressed liquid spraying", + "a sprayer sprays liquid", + "a brief spray", + "a spray", + "a single spray", + "sprays burst", + "a sprayer sprays", + "The sound of liquid spraying was sharp and continuous", + "A rhythmic spraying noise came from the nozzle", + "The spray produced a soft hissing sound", + "A fine mist spray created a faint, audible noise", + "The spraying sound was consistent and soothing" + ] + }, + { + "event": "squawk", + "phrases": [ + "squawks", + "a bird squawk", + "pigeons are squawking", + "birds squawks", + "an animal squawks", + "birds squawking", + "birds are squawking", + "a bunch of birds squawking", + "A loud squawk pierced the air", + "The bird's squawk was harsh and abrasive", + "A series of squawks echoed in the distance", + "The squawking noise was sharp and unpleasant", + "A single squawk sounded abruptly nearby" + ] + }, + { + "event": "squeak", + "phrases": [ + "squeak", + "squeaky", + "squeaking", + "squeaks", + "squeaks sound repeatedly", + "squeaky sounds", + "a squeak sounds", + "another squeak", + "squeaky sounds are being made", + "a squeaky sound", + "squeaky noises", + "a squeaky voice", + "a squeak sound", + "squeak sounds", + "a squeaky noise", + "a small squeaking", + "two squeaks", 
+ "shoe squeaking", + "squeaking sounds in the background", + "several squeaks", + "a man is squeaking", + "squeaky sound plays", + "squeaking loud", + "short squeaks", + "a squeaking sound", + "a squeaking", + "squeaks occur", + "a squeaking noise", + "an object squeaks", + "squeaking noise", + "A faint squeak came from the floorboards", + "The sound of a squeak was high-pitched and brief", + "A rhythmic squeaking noise indicated movement", + "The squeak was sharp and intermittent", + "A soft squeak sounded faintly in the background" + ] + }, + { + "event": "squeal", + "phrases": [ + "squeal", + "a squeal", + "squealing", + "squealing consistently", + "the squealing", + "a squealing", + "screeching", + "a screeching", + "a quick squeal", + "a squealing sound", + "A loud squeal echoed sharply in the air", + "The squealing noise was high-pitched and intense", + "A faint squeal was heard in the distance", + "The sound of the squeal varied slightly in pitch", + "A sudden squeal was sharp and startling" + ] + }, + { + "event": "static", + "phrases": [ + "static", + "static mic", + "static noise", + "radio static sound", + "radio static", + "static occurs repeatedly", + "digital static", + "static continues", + "radio signal distortion", + "heavy static", + "a loud static plays continuously", + "static fills the microphone", + "static occurs continuously", + "a static distortion", + "a series of static", + "continuous static", + "a series of radio white noise", + "roaring static", + "television static", + "a static", + "microphone static", + "static crackles", + "portion of static", + "telephone static", + "The crackling of static was faint but persistent", + "A hissing noise of static filled the background", + "The sound of static was sharp and irregular", + "A burst of static noise was heard over the audio", + "The static sound was continuous and slightly crackling" + ] + }, + { + "event": "steam", + "phrases": [ + "steam", + "hissing steam", + "steam hisses sharply", 
+ "steam hiss", + "an iron letting off steam", + "steam opening", + "steam from train", + "a long spray of steam is escaping", + "a steam engine lets off steam", + "the sounds of steam", + "steam releases", + "a steamy implement is used to help clean floors", + "steam puffs", + "steam train hiss", + "steam sounds", + "steam engine", + "the hiss of steam", + "the hissing of steam", + "a steam engine is hissing and chugging", + "steam hisses", + "steam hissing repetitively", + "its engine steam hisses", + "steam mix intermittently", + "steam loudly hisses", + "the steam hisses", + "steam is releasing from an engine", + "steam hisses loudly", + "the hiss of pressurized steam", + "steam is released then stops and is released again", + "hissing from steam", + "The sound of steam escaping was sharp and hissing", + "A faint hissing noise indicated steaming water", + "The steam emitted a consistent, gentle sound", + "A rhythmic release of steam created a soft whooshing noise", + "The sound of steam was soothing and constant" + ] + }, + { + "event": "steam_whistle", + "phrases": [ + "a steam whistle", + "steam whistle", + "a steam whistle sounds", + "a steam whistle toots", + "a steam whistle goes off", + "a steam whistle blows", + "a train steam whistle", + "steam whistles", + "steam whistle is roaring", + "steam whistle is sounding ", + "a steam whistle is triggered", + "a loud steam whistle", + "a steam whistle", + "The steam whistle emitted a sharp, high-pitched tone", + "A loud whistle sound came from the steam whistle", + "The sound of the steam whistle was piercing and clear", + "A rhythmic tooting noise indicated a steam whistle in use", + "The whistle sound was abrupt and attention-grabbing" + ] + }, + { + "event": "stream", + "phrases": [ + "stream", + "stream water", + "stream water flows", + "streams", + "water streams", + "running water in a river", + "stream flowing continuously", + "stream of water", + "water running down a stream", + "streaming waters", + 
"river stream", + "river running", + "river running down stream", + "water trickles down a stream", + "a loopable water stream", + "a stream of water flowing", + "streaming water", + "water running down river", + "a stream of water flows and trickles", + "river water streaming", + "a stream of water trickles and flows", + "rippling water flows steadily", + "a river of water flows", + "river of water flowing", + "a river stream of water flowing", + "a stream of water flowing and trickling", + "water stream running", + "a small stream", + "water trickling down the stream", + "a stream of water flows slowly and splashes", + "The gentle sound of a stream flowing was soothing", + "A rhythmic gurgling noise came from the stream", + "The sound of water trickling was clear and persistent", + "A soft splashing noise indicated the presence of a stream", + "The stream's sound was natural and calming" + ] + }, + { + "event": "tearing", + "phrases": [ + "tearing", + "tearing paper", + "paper tearing", + "tearing tape", + "tearing plastic", + "tearing up paper", + "tears paper", + "paper ripping", + "paper tears", + "peeling paper", + "single paper rip", + "tearing of a sheet of toilet paper", + "someone is tearing into pieces a small sheet of paper", + "tearing masking tape", + "tearing aluminum/tin foil", + "a paper tear reveal", + "tearing noise", + "someone is tearing a thin piece of rough cardboard", + "tearing paper sound", + "consistent ripping and tearing", + "someone is tearing up a piece of paper", + "The sound of tearing paper was sharp and abrupt", + "A faint ripping noise came from tearing fabric", + "The tearing sound was quick and distinct", + "A rhythmic tearing noise indicated repeated action", + "The sound of tearing was rough and jarring" + ] + }, + { + "event": "telephone", + "phrases": [ + "telephone", + "telephones", + "phones", + "the sounds of telephones", + "a telephone dialing occurs repeatedly", + "phone being hung up several times", + "telephone 
sounds", + "dialing phone", + "telephones dial", + "telephone noises", + "a telephone busy signal sounds", + "The sound of a telephone dialing tone was steady and clear", + "A rhythmic ringing noise came from the telephone", + "The telephone emitted a loud, repetitive beep", + "A soft click followed by a tone indicated a call being placed", + "The sound of the telephone was electronic and distinct" + ] + }, + { + "event": "telephone_bell_ringing", + "phrases": [ + "telephone bell", + "telephone bell rings", + "phone bells", + "someone is ringing a bell", + "a telephone bell rings intermittently", + "telephone bells ring multiple times", + "a telephone bell rings", + "telephone bells", + "a phone is ringing with a mechanical bell", + "a phone rings with a real phone bell", + "phone bells ring", + "telephone bells ring", + "telephone bells are ringing repeatedly", + "telephone bell ringing sounds", + "a telephone bell ringing several times", + "a bell telephone rings", + "a telephone is ringing inside an office room", + "ringing phone", + "a telephone bell rings with mechanisms", + "telephone bell ringing", + "telephone bells ringing", + "electronic phone ring", + "a phone rings and goes to busy", + "telephone rings", + "a telephone and its bell ringing", + "the sound of a telephone bell ringing", + "a telephone rings loudly two times", + "The telephone bell rang with a sharp, metallic tone", + "A rhythmic ringing sound indicated an incoming call", + "The sound of the telephone bell was loud and clear", + "A persistent ringing noise came from the telephone", + "The telephone bell emitted a distinct, repetitive chime" + ] + }, + { + "event": "telephone_dialing", + "phrases": [ + "electronic touch tone telephone dialing", + "dialing phone", + "a series of telephone keys dialing", + "telephones dial and ring", + "telephone dials", + "dialing", + "phone number dialing", + "he dials an old-fashioned phone", + "telephone dialing mechanisms echoing", + "telephone dial 
tones", + "dialing on a phone using touch tone dialing", + "telephone dialing and ringing", + "old phone number dial system is being used", + "a telephone dials and rings", + "dialing a telephone and ringing", + "someone dials on a rotary telephone", + "a telephone dials and tones", + "a phone is picking up and dialing a number", + "a telephone dialing occurs repeatedly", + "a telephone is ringing, dialing, and being answered", + "a series of telephone dialing tones", + "phones dial", + "manual telephone ringing", + "a telephone dialing tone ringing", + "a phone rings and goes to busy", + "a woman dials a telephone", + "telephone dialing sounds", + "telephone dialing and clicking sounds", + "someone is dialing a number on an old telephone", + "dialing occurs on a telephone", + "A sequence of tones indicated telephone dialing", + "The sound of dialing was rhythmic and electronic", + "A high-pitched beep followed by a pause indicated a digit entry", + "The dialing sounds were sharp and distinct", + "A series of quick beeps signaled dialing activity" + ] + }, + { + "event": "throat_clearing", + "phrases": [ + "throat clearing", + "throat clearing sounds", + "a person throat clearing", + "throat clearing noise", + "a man is making throat clearing noises", + "a person clearing his throat", + "someone makes throat sounds", + "A sharp throat-clearing sound broke the silence", + "The sound of throat clearing was abrupt and deliberate", + "A soft clearing noise indicated an attempt to gain attention", + "The throat-clearing sound was low and guttural", + "A faint throat-clearing noise was heard in the background" + ] + }, + { + "event": "thump", + "phrases": [ + "thump", + "thud", + "whack", + "hitting", + "thumping occurs", + "a hitting sound", + "something thumps", + "hitting sounds", + "thud sound", + "whack sound", + "a hit sound", + "a thud sound", + "A heavy thump echoed through the room", + "The sound of a thump was dull and abrupt", + "A rhythmic thumping noise 
indicated repeated impact", + "The thump was deep and resonant", + "A faint thump could be heard from the distance" + ] + }, + { + "event": "thunder", + "phrases": [ + "thunder", + "thunderstorm", + "crashing thunder", + "thunders", + "thunder is striking during a storm", + "thunder storm", + "loud thunderstorm", + "loud thunder clap", + "loud thunderclap", + "loud thunder that cracks five times", + "rolling thunder", + "the thunder", + "a close thunder strike", + "dry thunder", + "single close strong thunder", + "loud thunder", + "thunder is being recorded and remastered", + "thunderstorms", + "loud roars of thunder", + "a loud thunder strike", + "thunderstorm rumbles outside", + "thunder crashes", + "thunderstorm rumbles", + "heavy thunder", + "light thunder", + "thunder loudly", + "thunder sound", + "thunder slamming", + "rumbling thunder", + "thunder with reverb", + "A loud crack of thunder echoed sharply across the sky", + "The rumbling sound of thunder grew louder as the storm approached", + "A distant roll of thunder was faint but persistent", + "The thunderclap was sudden and startling", + "The low, resonant thunder shook the atmosphere" + ] + }, + { + "event": "thunderstorm", + "phrases": [ + "thunderstorm", + "thunder storm", + "thunderstorm sound", + "loud thunderstorm", + "thunderstorms", + "thunder is striking during a storm", + "crashing thunderstorms", + "thunderstorm rumbles outside", + "loud thunderstorm", + "thunderstorm sounds", + "thunderstorms roar", + "thunderstorm sounds with lightning", + "thundering with the rain coming down in sheets", + "a thunderstorm is looping", + "thunderstorm rumbles", + "thunderstorms rage", + "thunderstorms rumble", + "The sound of thunder rumbled continuously during the storm", + "A mix of heavy rain and thunder created a dramatic atmosphere", + "The thunderstorm produced sharp cracks followed by deep rumbles", + "The noise of the thunderstorm was intense and unrelenting", + "The soundscape was filled with 
overlapping thunderclaps and rain" + ] + }, + { + "event": "thunk", + "phrases": [ + "thunk sound", + "thud", + "thump", + "thud", + "whack", + "hitting", + "thumping occurs", + "a hitting sound", + "something thumps", + "hitting sounds", + "thud sound", + "whack sound", + "a hit sound", + "a thud sound", + "thunking", + "a thunk sound", + "A dull thunk echoed as the object hit the surface", + "The thunk was low-pitched and hollow", + "A faint thunk indicated something falling nearby", + "The sound of the thunk was abrupt and dampened", + "A repetitive thunking noise came from the distance" + ] + }, + { + "event": "tick", + "phrases": [ + "tick", + "a clock ticking", + "a clock tick-tocks", + "ticking", + "ticking sound", + "ticking from a clock", + "a ticking clock", + "a clock ticks", + "tick-tock sound", + "a tick", + "A sharp tick echoed in the quiet room", + "The ticking sound was steady and rhythmic", + "A faint metallic tick could be heard in the background", + "The sound of ticking was sharp and precise", + "A series of rapid ticks indicated a fast-moving mechanism" + ] + }, + { + "event": "tick-tock", + "phrases": [ + "tick-tock", + "tick-tocking", + "tick-tocks", + "tick-tock sounds", + "a tick-tock", + "tick-tock noise", + "a tick-tock occurs", + "tick-tocking consistently", + "a loud tick-tock", + "tick-tock sounds intermittently", + "a tick-tocking", + "tick-tocking sounds", + "a tick-tock rhythm", + "tick-tock of the pendulum", + "tick tocking", + "tick-tock of a clock", + "a series of medium tick-tocks", + "a tick-tock repeats rhythmically", + "tick-tock goes a clock", + "a tick-tock of a clock", + "tick-tocking by a clock", + "tick-tocking of a clock", + "a tick tock", + "loud, slow tick-tocking", + "tick tock of a clock", + "a tick tock of a clock", + "a low, soft tick-tock", + "tick-tock of a single clock", + "rhythmic tick-tocking", + "a tick-tock sound", + "The repetitive tick-tock of a clock was soothing", + "A steady tick-tock noise filled the 
air", + "The sound of tick-tock was rhythmic and calming", + "The clock's tick-tock echoed faintly in the room", + "A sharp tick followed by a soft tock created the classic clock sound" + ] + }, + { + "event": "tire_squeal", + "phrases": [ + "tire-squealing", + "tires skid and squeal loudly", + "tire squeal", + "a car skids very loudly", + "tire skids", + "vehicle skidding", + "tire skidding", + "vehicle tires squeal loudly", + "tires squeal while skidding", + "tires screech around a turn", + "cars skids", + "tires squeal", + "tires squeal the entire time", + "race car skidding", + "vehicle skid very loudly nearby", + "tire squealing", + "a car continuously skids", + "a car accelerates skidding", + "tires skid", + "tires skid and squeal", + "a car skids and honks", + "vehicles squeal tires", + "tires skidding", + "a vehicle accelerates and skids", + "an vehicle accelerates and skids", + "vehicle squealing tires", + "tires squeal and skid", + "cars are skidding with tire squeal", + "vehicle tires skidding", + "tires screech and squeal", + "A sharp tire squeal pierced the air as the vehicle braked", + "The high-pitched squealing noise was sudden and intense", + "A prolonged tire squeal indicated rapid acceleration", + "The sound of squealing tires echoed across the road", + "A faint tire squeal could be heard in the distance" + ] + }, + { + "event": "toilet_flush", + "phrases": [ + "a toilet is being flushed in a state room", + "water flows down a flushed toilet", + "water flows down a toilet", + "water rushes down the toilet", + "a woman flushes a toilet", + "toilets flush", + "water runs down a flushed toilet", + "a toilet is flushed for a few seconds", + "a toilet flushing in the background", + "toilet flushes abruptly", + "bathroom WC water", + "water flowing down a flushed toilet", + "an industrial toilet is flushed and drains", + "a toilet flushes in the background", + "a toilet flushes and water runs", + "a commercial toilet flushes extra fast", + "toilet is 
being flushed by water", + "water running down a flushed toilet", + "a toilet flushes lengthily", + "a toilet flushes and drains speedily", + "a toilet flushes and runs", + "flushing the toilet", + "water running from a flushed toilet", + "a toilet flushes loudly", + "tank valve is off and flush", + "a toilet flush in the background", + "a toilet flushes quickly", + "a toilet is being flushed in a small bathroom", + "toilet flushing and water running", + "The sound of a toilet flushing was abrupt and gushing", + "A loud whooshing noise accompanied the toilet flush", + "The toilet flushing sound was steady and brief", + "A gurgling noise followed the toilet flush, indicating drainage", + "The sound of the toilet flushing faded quickly" + ] + }, + { + "event": "traffic_noise", + "phrases": [ + "traffic noise", + "traffic sounds", + "traffic sounds are present and ongoing", + "traffic nearby", + "traffic is making noise", + "ambient highway noise", + "the sound of traffic", + "traffic noise fills the roadways", + "traffic ambiance in the background", + "city traffic sounds", + "the noise of traffic", + "traffic is ongoing", + "road noise occurs", + "traffic noises", + "traffic sounds on road", + "traffic flows", + "traffic sounds are looped", + "traffic noise in an urban setting", + "sound near a highway", + "sounds of traffic", + "traffic background", + "some traffic noise", + "traffic makes noises in the distance", + "traffic noise on the road", + "traffic fills the streets", + "traffic is near by", + "traffic in background", + "traffic sounds in the background", + "traffic noise in the street", + "traffic is present", + "The constant hum of traffic noise filled the city", + "A mix of honks and engine sounds created a bustling atmosphere", + "The sound of traffic noise was loud and unrelenting", + "A faint background noise of vehicles could be heard", + "The traffic noise grew louder during rush hour" + ] + }, + { + "event": "train", + "phrases": [ + "train", + 
"train sound", + "train passing by", + "train running", + "a train runs slowly on railroad tracks", + "a train moving", + "a train goes by", + "a train approaches", + "rustling of a train passing", + "a train running on railroad tracks", + "a train speeds by", + "brass suspension scary theme", + "death orb sounds", + "sound cue", + "camera interaction", + "chord progression", + "fire alarm sound", + "original beat", + "downshifting", + "a corporate rise-and-hit logo sound", + "pedal point", + "action sound", + "extended tail version", + "warbling suspenseful sound", + "putting the lid up", + "ball sound", + "wind down sound", + "catchy ad jingle", + "love gate sound", + "slide sound", + "The rumble of the train grew louder as it approached", + "A distinctive clattering noise came from the train wheels on the tracks", + "The sound of the train was rhythmic and mechanical", + "A faint train whistle accompanied the rumbling noise", + "The noise of the train echoed across the open landscape" + ] + }, + { + "event": "train_horn", + "phrases": [ + "a train whistle repeats multiple times", + "a train sounds the horn at a regular pace", + "a train's horn runs", + "train horn", + "a train horn sounds and echoes", + "a train blowing a horn twice", + "a training horn emits two lingering sounds", + "a train horn blows in the distance before becoming louder", + "a train whistle blares multiple times", + "a train horn sound", + "short train horn blast", + "train horn sounding", + "train horn audio recording", + "commuter train sound", + "a train car whistle", + "a train horn blares twice", + "a train horn sounds and approaches quickly", + "a train horn blares multiple times", + "a train horn sounds quickly", + "a train blows its horn twice", + "a train blowing its horn twice", + "a loud but brief train horn blares", + "two train horns sound", + "a train whistle blows three times", + "a train blowing its horn once", + "a train horn is triggered", + "a train horn sounds loudly and 
long", + "a train warning horn", + "a train horn sounds", + "a train horn blows three times", + "The train horn blared loudly, signaling its approach", + "A deep, resonant horn sound echoed through the area", + "The train horn was sharp and attention-grabbing", + "A prolonged horn blast indicated the train's presence", + "The sound of the train horn grew fainter as it moved away" + ] + }, + { + "event": "train_whistle", + "phrases": [ + "a train whistle repeats multiple times", + "a train sounds the horn at a regular pace", + "a train's horn runs", + "train horn", + "a train horn sounds and echoes", + "a train blowing a horn twice", + "a training horn emits two lingering sounds", + "a train horn blows in the distance before becoming louder", + "a train whistle blares multiple times", + "a train horn sound", + "short train horn blast", + "train horn sounding", + "train horn audio recording", + "commuter train sound", + "a train car whistle", + "a train horn blares twice", + "a train horn sounds and approaches quickly", + "a train horn blares multiple times", + "a train horn sounds quickly", + "a train blows its horn twice", + "a train blowing its horn twice", + "a loud but brief train horn blares", + "two train horns sound", + "a train whistle blows three times", + "a train blowing its horn once", + "a train horn is triggered", + "a train horn sounds loudly and long", + "a train warning horn", + "a train horn sounds", + "a train horn blows three times", + "The train whistle emitted a clear, high-pitched tone", + "A sharp whistle noise signaled the train's arrival", + "The sound of the train whistle was distinct and melodic", + "A long, echoing whistle was heard in the distance", + "The train whistle grew louder as the locomotive neared" + ] + }, + { + "event": "trickle", + "phrases": [ + "trickle", + "dribble", + "water dribbles nearby", + "water dribbles", + "water trickles", + "a stream trickles somewhere very close by", + "water trickles as it flows down", + 
"water trickling", + "water trickles and splashes", + "water flows at a steady trickle", + "trickling", + "some liquid trickles", + "trickle sounds", + "a stream of water flows and trickles", + "water trickling continuously", + "water is trickling", + "trickles", + "The soft trickle of water was soothing to hear", + "A faint trickling noise came from the nearby stream", + "The sound of trickling water was rhythmic and calming", + "A gentle trickle was audible in the background", + "The trickling noise grew louder as the water flowed faster" + ] + }, + { + "event": "truck", + "phrases": [ + "truck", + "truck running", + "truck engine", + "a loud truck engine", + "a truck is reversing", + "truck engine slows", + "a truck engine is revving up", + "a truck moves at constant pace", + "truck moving", + "a loud truck engine riving up", + "a truck accelerates repeatedly", + "a truck engine is accelerated", + "a loud truck engine riving up again", + "a big truck is departing", + "a truck drives", + "a truck engine goes by and slows", + "truck sounds", + "a truck is reversing and accelerating", + "a truck is making vroom sounds", + "a truck is accelerating and revving", + "a truck engine accelerating", + "a truck acceleration", + "a big truck sound", + "a truck engine is working at regular speed", + "a dump truck is reversing", + "a truck travels", + "a truck is stopping and accelerating", + "a truck goes by", + "a truck or something", + "a truck is running outdoor", + "The deep rumble of a truck's engine echoed nearby", + "A loud, mechanical noise came from the truck as it passed", + "The truck's engine produced a steady droning sound", + "A rhythmic clattering noise was heard as the truck moved over uneven ground", + "The sound of the truck faded as it drove away" + ] + }, + { + "event": "turkey", + "phrases": [ + "turkeys", + "turkey calls", + "turkey sounds", + "turkey gobbling", + "the sounds of a turkey", + "turkey vocalizations", + "turkeys vocalize", + "the sounds of 
turkeys", + "a turkey", + "turkeys speaking", + "gobbles of turkeys", + "turkeys gobbling", + "calls of turkeys", + "turkeys are making calls continuously", + "a turkey calls", + "a variety of turkey sounds", + "turkeys gobble", + "turkeys gobble loudly", + "a group of turkeys making sounds", + "turkeys make calls", + "a turkey gobbles", + "gobbling turkeys", + "chicken clucks", + "a turkey gobbling", + "male turkeys fighting", + "A loud gobble sound was emitted by the turkey", + "The turkey's gobbling noise was sharp and repetitive", + "A rhythmic gobble echoed in the distance", + "The sound of the turkey was distinct and attention-grabbing", + "A faint turkey gobble was heard in the background" + ] + }, + { + "event": "typewriter", + "phrases": [ + "typewriter", + "the sounds of a typewriter", + "typewriter mechanisms", + "a typewriter punctuates brief mechanisms", + "a typewriter", + "typewriter keys clack repeatedly", + "a typewriter makes sounds with scrapes and pings", + "typewriter sound effects", + "typewriter clicking to mechanisms", + "a typewriter types", + "typewriter clicks", + "typewriters", + "typing on a typewriter with clicking and clacking", + "typewriter noises", + "a manual typewriter is being used with single and multiple line feeds", + "a portable typewriter is typing with automatic spacing", + "an old fashioned typewriter being typed on quickly", + "typewriter typing", + "typewriter keys clack", + "typing on an old-fashioned typewriter", + "typewriter roller and ring", + "a typewriter functioning", + "a typewriter clicks and clacks", + "typing sounds from a typewriter", + "someone is typing fast on an old-fashioned typewriter", + "a typewriter types with a ding", + "typing on a typewriter", + "a typewriter operates", + "a typewriter in use", + "a typewriter clacks", + "The typewriter emitted a sharp clicking sound as keys were pressed", + "A rhythmic series of clicks and dings came from the typewriter", + "The sound of the typewriter was 
mechanical and precise", + "A faint clattering noise indicated the typewriter was in use", + "The typewriter's noise was sharp and consistent" + ] + }, + { + "event": "typing", + "phrases": [ + "typing", + "keyboard typing sounds", + "keyboarding sounds", + "typing very fast", + "typing sounds on a computer keyboard", + "computer keyboard sounds", + "typing sounds on computer keyboard", + "rapid keyboard typing", + "typing noises", + "typing on a keyboard is ongoing in the foreground", + "typing noise", + "short typing pattern on keyboard being repeated a few times", + "rapid typing", + "computer keyboard clicking sounds", + "typing sounds from a computer keyboard", + "typing sound of a computer keyboard", + "typing produces clicks on a keyboard", + "computer keyboard typing sounds", + "computer keyboard mechanisms", + "rapid typing on keyboard", + "typing sounds", + "keyboard sounds", + "clicking computer keyboard sounds", + "typing continuously", + "typing sounds on typewriter", + "intermittent typing on computer keyboard", + "rapid typing on a keyboard", + "a sequence of quick typing keystroke clanking", + "someone is entering a password using a keyboard", + "rapid typing of keyboard", + "The sound of typing was quick and rhythmic", + "A sharp clicking noise came from the keyboard", + "The typing sound grew louder as the pace increased", + "A faint tapping noise indicated someone typing nearby", + "The typing noise was steady and mechanical" + ] + }, + { + "event": "vacuum_cleaner", + "phrases": [ + "vacuum cleaner", + "vacuum", + "vacuum cleaner is being turned on and off", + "wood sander", + "a vacuum cleaner is starting and cutting off", + "vacuum cleaner is moving back and forth", + "vacuum cleaner being turned on and off", + "blending", + "a cylinder-type vacuum cleaner is stopping", + "someone is turning off a vacuum cleaner", + "a vacuum cleaner is switching off", + "a vacuum cleaner operates while making contact with a surface", + "a vacuum cleaner is 
turning on and off", + "a vacuum cleaner is being used on various surfaces", + "a vacuum cleaner is in use and stops", + "a vacuum cleaner is stopping on a carpet", + "hoover is being turned on/off", + "spraying the foam", + "a vacuum cleaner is running and making surface contact", + "air dryer sounds", + "a hand dryer in a bathroom is playing", + "someone is cleaning the house with an aspirator", + "vacuum running", + "someone is using a vacuum in their house", + "blender sounds", + "air freshener sound", + "a vacuum cleaner runs", + "street sweeper noise collector", + "a vacuum cleaner operates", + "air-compressor sounds", + "The vacuum cleaner emitted a loud, steady hum", + "A rhythmic suction noise came from the vacuum cleaner", + "The sound of the vacuum cleaner was mechanical and droning", + "A faint whirring noise indicated the vacuum cleaner was in use", + "The vacuum cleaner's noise grew softer as it moved to another room" + ] + }, + { + "event": "vehicle_horn", + "phrases": [ + "a car toots short", + "a horn honks in different tones repeatedly", + "vehicle horns honking several times", + "a series of a vehicle horn sounding", + "a car horn is honked several times in a row", + "a small vehicle horn toots once", + "a vehicle horn honking many times", + "a car horn goes off four times in two sets", + "a brief, loud car horn", + "a couple of car horns honking one after the other", + "a small car horn toots a few times", + "vehicle horns are triggered several times", + "a vehicle horn honks repeatedly and loudly", + "a car horn is honked several times", + "person is repeatedly hitting their car horn over and over and over", + "custom car horn", + "honking car horn sounds", + "an antique car horn honks repeatedly", + "vehicle car horn alarm blasts repeatedly", + "vehicle honking horn several times", + "a small car horn blows three times", + "a vehicle horn beeps loudly several times", + "a car horn honks melodically in different tones", + "a vehicle horn honks 
in alternating tones", + "a vehicle honking its horn several times", + "vehicle honking its horn", + "a horn is sounded on a moped", + "a vehicle honking at irregular intervals", + "A loud honk came from the vehicle horn", + "The horn's noise was sharp and attention-grabbing", + "A rhythmic honking sound indicated urgency", + "The vehicle horn blared continuously in the traffic", + "A faint honk was heard in the distance" + ] + }, + { + "event": "walk", + "phrases": [ + "walk", + "mid run sound", + "stepping", + "footsteps", + "walking", + "walk sound", + "footsteps walk", + "a man walks", + "walking occurs", + "a person walks along", + "light footsteps", + "footsteps occur briefly", + "sound of footsteps", + "sound of walking", + "footsteps take place", + "The sound of footsteps was steady and rhythmic", + "A faint tapping noise came from shoes hitting the ground", + "The walking noise grew louder as the person approached", + "A soft shuffling sound indicated slow walking", + "The rhythmic clatter of footsteps echoed in the hallway" + ] + }, + { + "event": "water", + "phrases": [ + "water", + "water effects", + "water sound effect", + "water from the fountain", + "water spring", + "gentle water effects", + "a water sound effect", + "a fountain of water flows", + "water trickles down into more water", + "the sounds of water", + "water flows in a bowl", + "water sounds", + "liquid", + "the water", + "water pours out onto a surface", + "a pouring water source streams", + "water falling onto itself", + "water filling a container", + "water is running into a mug", + "water makes contact with a surface", + "water fountain field recording", + "water fills a tap", + "filling water", + "water is splashing down into a basin", + "water runs onto itself", + "water runs once more", + "a water tap flows and splashes", + "some liquid flow is released several times", + "spilling water", + "fountain", + "The sound of water splashing was soft and rhythmic", + "A steady trickling 
noise indicated flowing water", + "The sound of water dripping was sharp and distinct", + "A bubbling noise came from the water in motion", + "The gentle sound of water created a soothing ambiance" + ] + }, + { + "event": "water_tap", + "phrases": [ + "water faucet", + "water tap sound", + "flush sound", + "a water tap runs briefly", + "a water tap runs and splashes", + "a water faucet is running", + "water runs from a faucet", + "a water faucet pouring", + "faucet runs", + "faucet water dripping", + "faucet water pouring", + "water trickling from a faucet", + "The water tap emitted a steady gushing sound as it was opened", + "A rhythmic dripping noise came from the tap left slightly open", + "The sound of the water tap turning off was abrupt and final", + "A faint hissing noise came from the water tap under pressure", + "The sound of water flowing from the tap was clear and steady" + ] + }, + { + "event": "waterfall", + "phrases": [ + "waterfall", + "waterfalls", + "a stream of water falls", + "a big waterfall", + "a large waterfall", + "a waterfall cascades", + "a huge waterfall", + "water falls", + "water cascades down a waterfall", + "a waterfall", + "the rushing of a waterfall", + "a large stream of water", + "water flows in a waterfall", + "water cascades down in a waterfall", + "loud rushing water from a river", + "a stream and waterfall", + "a large flow of liquid", + "a waterfall ambience", + "a loud roar of a waterfall", + "a stream of water rushing rapidly", + "a waterfall in the rural area", + "stream rushes loudly", + "a heavy stream of water", + "a waterfall is pouring into a stream", + "a small waterfall", + "a strong and powerful flowing waterfall", + "continuous roaring of a waterfall", + "The sound of the waterfall was loud and continuous", + "A deep roaring noise came from the cascading water", + "The waterfall's sound was rhythmic and powerful", + "A soft misty spray accompanied the sound of the waterfall", + "The distant roar of a waterfall was 
faint but distinct" + ] + }, + { + "event": "waves", + "phrases": [ + "waves", + "waves (surf)", + "surf waves", + "seawash", + "ocean waves", + "close ocean waves", + "waves break", + "waves break continues in succession", + "ocean", + "surf comes ashore", + "ocean waves are repeatedly splashing on shore", + "waves are moving and crashing in the background", + "waves roll slowly", + "ocean waves break", + "waves move aside", + "ocean waves ebb and flow", + "waves splash several times", + "large waves hit against a beach continuously", + "waves are continuously washing onto shore", + "ocean waves are breaking and crashing onto shore", + "the waves rush off", + "ocean waves are moving at a moderate pace", + "ocean currents", + "waves crashing onto shore continuously", + "surf", + "waves continuously crashing to shore", + "the ocean", + "the ocean waves are hitting the shore at a moderate pace", + "ocean waves are crashing and splashing onto shore", + "ocean waves repeatedly crash", + "The gentle lapping of waves was soothing to hear", + "A rhythmic crashing noise came as waves hit the shore", + "The sound of waves rolling was deep and continuous", + "A faint splashing noise indicated distant waves", + "The sound of waves breaking was sharp and distinct" + ] + }, + { + "event": "whip", + "phrases": [ + "whip", + "whips", + "a whip", + "whip cracks", + "whips smack fiberglass", + "whip sounds", + "the sounds of whipping", + "whip whooshing", + "whips crack", + "whips and whacks", + "whipping", + "the sound of a whip", + "a whip crack", + "a sudden whip", + "a whipping rush", + "whips crack in the wind", + "whip cracking", + "the sound of a whip cracking", + "whip noises", + "whips cracking in the wind", + "A sharp cracking sound came from the whip", + "The whip's motion produced a loud, snapping noise", + "A faint whistling noise preceded the whip's crack", + "The sound of the whip was sudden and startling", + "A rhythmic whipping noise indicated repeated motion" + ] 
+ }, + { + "event": "whispering", + "phrases": [ + "whispering", + "whispering noise", + "whisling", + "whispering sounds", + "whispering noises", + "whispering with reverb", + "whispering in a small room", + "whispered ", + "whispering human voices", + "friendly ghost is whispering with variation", + "someone is whispering \"dare to be you\"", + "someone is whispering secrets to someone else", + "whispering in a large room", + "a whispering ghost sound", + "whispering sounds in the background", + "fantasy whispers are being whispered", + "whispered speech", + "human whispering", + "a person whispers", + "a young person whispers", + "someone is whispering", + "a female whispering", + "a person whispers in a small room", + "whispered words", + "someone is whispering", + "The sound of whispering was soft and indistinct", + "A faint whisper could be heard in the quiet room", + "The whispering noise was rhythmic and calming", + "A soft murmur of whispering filled the background", + "The whisper grew louder as the speaker leaned closer" + ] + }, + { + "event": "whistle", + "phrases": [ + "whistle", + "the whistle", + "a whistle", + "a louder whistle", + "a whistle sample", + "a whistle sound", + "multiple whistle sounds", + "a three note whistle", + "a sharp whistle", + "a mouth whistle", + "a whistle sounds", + "a whistle chirp", + "a series of whistling", + "a short whistle", + "the whistle gets louder", + "boys whistle a specific pattern", + "a human whistle", + "another whistle", + "a small rising whistle", + "melodical whistling", + "one of them whistles loudly", + "two quick whistle sounds", + "a young person whistles loudly and continuously", + "a series of sharp whistling", + "people whistle", + "several loud whistles", + "whistles", + "a loud whistle", + "whistle noise", + "A sharp whistle pierced the air", + "The whistle produced a high-pitched, clear tone", + "A rhythmic whistling noise came from the instrument", + "The sound of the whistle was 
attention-grabbing and distinct", + "A faint whistle could be heard in the distance" + ] + }, + { + "event": "whistling", + "phrases": [ + "whistling", + "whistling begins", + "whistling nearby", + "whistling consistently", + "whistling sound", + "whistling occurs", + "whistling sounds", + "whistling sound being made", + "whistling is ongoing", + "whistling is heard", + "whistling noises", + "whistling repeating", + "whistling noise", + "whistles sound", + "whistles", + "whistling of a person", + "whistling from a small group", + "whistling from a person", + "whistling takes place", + "continuous whistling", + "whistling takes place repeatedly", + "a woman whistles", + "an upbeat whistling", + "whistling noise in background", + "a woman whistles a tune", + "flirty whistling", + "constant whistling", + "consistent musical whistling", + "whistling noises that occur a few times", + "human whistling", + "A high-pitched whistling noise was steady and melodic", + "The sound of whistling was rhythmic and cheerful", + "A faint whistling noise came from someone nearby", + "The whistling sound grew louder as it approached", + "The whistling noise was sharp and clear" + ] + }, + { + "event": "whoosh", + "phrases": [ + "whoosh", + "a whoosh", + "a stereo whoosh", + "whoosh-swoosh", + "swoosh", + "fast whoosh", + "a large whoosh", + "a rushing whoosh", + "a loud whoosh", + "a quick whoosh", + "a fast whoosh", + "a woosh", + "a swooshing", + "a swish", + "an analog whoosh effect", + "woosh", + "a loud, sweeping whoosh", + "a whooshing sweep", + "whooses", + "a whoosh goes by", + "whooshes", + "whooshing", + "objects whoosh", + "missil whoosh", + "a whoosh occurs", + "a whooshing", + "objects whoosh by", + "a whoosh", + "A loud whooshing noise came from something moving quickly through the air", + "The whoosh was sharp and sudden, fading quickly", + "A faint whooshing noise indicated distant motion", + "The sound of the whoosh was sibilant and brief", + "A steady whooshing noise 
accompanied the rapid movement" + ] + }, + { + "event": "wind_chime", + "phrases": [ + "wind chimes blowing", + "wind chimes softly ringing", + "chiming wind chimes", + "ringing of wind chimes", + "wind chimes are jingling", + "wind chimes are playing", + "a wind chime is sounding", + "a wind chime is clanging", + "wind chimes are being played", + "wind chimes are being run", + "wind chime sound", + "wind chimes are blowing in the wind", + "The wind chime produced a soft, melodic tinkling sound", + "A rhythmic chiming noise came as the wind blew", + "The sound of the wind chime was delicate and calming", + "A faint tinkling noise indicated distant wind chimes", + "The wind chime's sound was harmonious and gentle" + ] + }, + { + "event": "yell", + "phrases": [ + "yelling", + "the sounds of shouting", + "scream", + "an agony yelling", + "a loud yelling", + "screaming", + "another person yell ", + "an adult male yells", + "old man yells", + "a man is screaming in panic and pain", + "man screaming", + "wild screaming", + "screams", + "a loud screaming", + "a series of shouts from a woman", + "a loud male voice screams", + "yells", + "people are shouting \"hip hip hooray\"", + "females shout", + "A loud yell pierced the air, commanding attention", + "The sound of the yell was sharp and sudden", + "A faint yell could be heard in the distance", + "The yell was abrupt and startling", + "A repetitive yell indicated urgency or excitement" + ] + }, + { + "event": "yip", + "phrases": [ + "a yipping sound", + "yipping", + "a dog yipping", + "a dog yips and pants", + "dogs are yipping", + "animal yipping", + "a dog is yipping loudly", + "dogs yip", + "a dog barks and yips", + "A sharp yip broke the silence", + "The sound of a yip was high-pitched and brief", + "A rhythmic series of yips indicated excitement", + "The yip was faint but distinct in the background", + "A sudden yip was heard from a small dog nearby" + ] + } +] \ No newline at end of file diff --git 
a/utils/tests/test_logging.py b/utils/tests/test_logging.py new file mode 100644 index 0000000000000000000000000000000000000000..9ce9d080a7dd63c013ea7382e160d8b4ecfa6adb --- /dev/null +++ b/utils/tests/test_logging.py @@ -0,0 +1,19 @@ +import unittest +from pathlib import Path + +from utils.logging import LoggingLogger + + +class TestLoggingLogger(unittest.TestCase): + def setUp(self): + self.tmp_log_path = Path("./tmp_logging.txt") + + def test_logging_info(self): + logger = LoggingLogger(filename=self.tmp_log_path, + level="INFO").create_instance() + logger.info("logging information") + self.assertTrue(self.tmp_log_path.exists()) + + def tearDown(self): + if self.tmp_log_path.exists(): + self.tmp_log_path.unlink() diff --git a/utils/torch_utilities.py b/utils/torch_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..4c85938b256bf274abac79ae1b7be222820c108e --- /dev/null +++ b/utils/torch_utilities.py @@ -0,0 +1,168 @@ +import logging +from typing import Callable +from pathlib import Path +import torch +import torch.nn as nn + +logger = logging.Logger(__file__) + + +def remove_key_prefix_factory(prefix: str = "module."): + def func( + model_dict: dict[str, torch.Tensor], state_dict: dict[str, + torch.Tensor] + ) -> dict[str, torch.Tensor]: + + state_dict = { + key[len(prefix):]: value + for key, value in state_dict.items() if key.startswith(prefix) + } + return state_dict + + return func + + +def merge_matched_keys( + model_dict: dict[str, torch.Tensor], state_dict: dict[str, torch.Tensor] +) -> dict[str, torch.Tensor]: + """ + Args: + model_dict: + The state dict of the current model, which is going to load pretrained parameters + state_dict: + A dictionary of parameters from a pre-trained model. + + Returns: + dict[str, torch.Tensor]: + The updated state dict, where parameters with matched keys and shape are + updated with values in `state_dict`. 
+ """ + pretrained_dict = {} + mismatch_keys = [] + for key, value in state_dict.items(): + if key in model_dict and model_dict[key].shape == value.shape: + pretrained_dict[key] = value + else: + mismatch_keys.append(key) + logger.info( + f"Loading pre-trained model, with mismatched keys {mismatch_keys}" + ) + model_dict.update(pretrained_dict) + return model_dict + + +def load_pretrained_model( + model: nn.Module, + ckpt_or_state_dict: str | Path | dict[str, torch.Tensor], + state_dict_process_fn: Callable = merge_matched_keys +) -> None: + state_dict = ckpt_or_state_dict + if not isinstance(state_dict, dict): + state_dict = torch.load(ckpt_or_state_dict, "cpu") + + model_dict = model.state_dict() + state_dict = state_dict_process_fn(model_dict, state_dict) + model.load_state_dict(state_dict, strict=False, assign=True) + + +def create_mask_from_length( + lengths: torch.Tensor, max_length: int | None = None +): + if max_length is None: + max_length = max(lengths) + idxs = torch.arange(max_length).reshape(1, -1) # (1, max_length) + mask = idxs.to(lengths.device) < lengths.view(-1, 1) + # (1, max_length) < (batch_size, 1) -> (batch_size, max_length) + return mask + + +def loss_with_mask( + loss: torch.Tensor, + mask: torch.Tensor, + reduce: bool = True +) -> torch.Tensor: + """ + Apply a mask to the loss tensor and optionally reduce it. + + Args: + loss: Tensor of shape (b, t, ...) representing the loss values. + mask: Tensor of shape (b, t) where 1 indicates valid positions and 0 indicates masked positions. + reduce: If True, return a single scalar value; otherwise, return a tensor of shape (b,). + + Returns: + torch.Tensor: A scalar if reduce is True, otherwise a tensor of shape (b,). 
+ """ + expanded_mask = mask[(..., ) + (None, ) * (loss.ndim - mask.ndim)] + expanded_mask = expanded_mask.expand_as(loss) + masked_loss = loss * expanded_mask + + sum_dims = tuple(range(1, loss.ndim)) + loss_sum = masked_loss.sum(dim=sum_dims) + mask_sum = expanded_mask.sum(dim=sum_dims) + loss = loss_sum / mask_sum + + if reduce: + return loss.mean() + else: + return loss + + +def convert_pad_shape(pad_shape: list[list[int]]): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def create_alignment_path(duration: torch.Tensor, mask: torch.Tensor): + device = duration.device + + b, t_x, t_y = mask.shape + cum_duration = torch.cumsum(duration, 1) + print(mask.shape) + print(duration.shape) + print(cum_duration.shape) + cum_duration_flat = cum_duration.view(b * t_x) + path = create_mask_from_length(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + # take the diff on the `t_x` axis + path = path - torch.nn.functional.pad( + path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]) + )[:, :-1] + path = path * mask + return path + + +def trim_or_pad_length(x: torch.Tensor, target_length: int, length_dim: int): + """ + Adjusts the size of the specified dimension of tensor x to match `target_length`. + + Args: + x: + Input tensor. + target_length: + Desired size of the specified dimension. + length_dim: + The dimension to modify. + + Returns: + torch.Tensor: The adjusted tensor. 
+ """ + current_length = x.shape[length_dim] + + if current_length > target_length: + # Truncate the tensor + slices = [slice(None)] * x.ndim + slices[length_dim] = slice(0, target_length) + return x[tuple(slices)] + + elif current_length < target_length: + # Pad the tensor + pad_shape = list(x.shape) + pad_length = target_length - current_length + + pad_shape[length_dim] = pad_length # Shape for left padding + padding = torch.zeros(pad_shape, dtype=x.dtype, device=x.device) + + return torch.cat([x, padding], dim=length_dim) + + return x