JuyeopDang committed on
Commit 5ab5cab · verified · 1 Parent(s): cda6ed1

Upload 35 files

auto_encoder/components/distributions.py ADDED
@@ -0,0 +1,43 @@
1
+ #source: https://github.com/CompVis/latent-diffusion/blob/main/ldm/modules/distributions/distributions.py
2
+ import torch
3
+ import numpy as np
4
+
5
+ class DiagonalGaussianDistribution(object):
6
+ def __init__(self, parameters, deterministic=False):
7
+ self.parameters = parameters
8
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
9
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
10
+ self.deterministic = deterministic
11
+ self.std = torch.exp(0.5 * self.logvar)
12
+ self.var = torch.exp(self.logvar)
13
+ if self.deterministic:
14
+ self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)
15
+
16
+ def sample(self):
17
+ x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
18
+ return x
19
+
20
+ def kl(self, other=None):
21
+ if self.deterministic:
22
+ return torch.Tensor([0.])
23
+ else:
24
+ if other is None:
25
+ return 0.5 * torch.sum(torch.pow(self.mean, 2)
26
+ + self.var - 1.0 - self.logvar,
27
+ dim=[1, 2, 3])
28
+ else:
29
+ return 0.5 * torch.sum(
30
+ torch.pow(self.mean - other.mean, 2) / other.var
31
+ + self.var / other.var - 1.0 - self.logvar + other.logvar,
32
+ dim=[1, 2, 3])
33
+
34
+ def nll(self, sample, dims=[1,2,3]):
35
+ if self.deterministic:
36
+ return torch.Tensor([0.])
37
+ logtwopi = np.log(2.0 * np.pi)
38
+ return 0.5 * torch.sum(
39
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
40
+ dim=dims)
41
+
42
+ def mode(self):
43
+ return self.mean
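
For reference, a minimal usage sketch of the class above (illustrative, not part of the commit; shapes are made up): the input tensor is read as mean and log-variance concatenated along the channel dimension, sample() draws with the reparameterization trick, and kl() returns the closed-form KL divergence to a standard normal per batch element.

import torch
from auto_encoder.components.distributions import DiagonalGaussianDistribution

# Hypothetical shapes: batch of 2, 3 latent channels, 8x8 spatial grid.
params = torch.randn(2, 2 * 3, 8, 8)      # [mean, logvar] stacked along dim=1
posterior = DiagonalGaussianDistribution(params)

z = posterior.sample()                    # reparameterized draw, shape [2, 3, 8, 8]
kl = posterior.kl()                       # KL(q || N(0, I)) summed over dims [1, 2, 3], shape [2]
print(z.shape, kl.shape)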
auto_encoder/components/nonlinearity.py ADDED
@@ -0,0 +1,5 @@
1
+ import torch
2
+
3
+ def nonlinearity(x):
4
+ # swish
5
+ return x*torch.sigmoid(x)
auto_encoder/components/normalize.py ADDED
@@ -0,0 +1,4 @@
1
+ import torch
2
+
3
+ def Normalize(in_channels : int, num_groups : int = 32):
4
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
auto_encoder/components/resnet_block.py ADDED
@@ -0,0 +1,46 @@
1
+ # https://github.com/CompVis/latent-diffusion/blob/main/ldm/modules/diffusionmodules/model.py#L368
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from auto_encoder.components.normalize import Normalize
6
+ from auto_encoder.components.nonlinearity import nonlinearity
7
+
8
+ class ResnetBlock(nn.Module):
9
+ def __init__(self, *, in_channels : int, out_channels : int = None, conv_shortcut=False, dropout):
10
+ super().__init__()
11
+ self.in_channels = in_channels
12
+ out_channels = in_channels if out_channels is None else out_channels
13
+ self.out_channels = out_channels
14
+ self.use_conv_shortcut = conv_shortcut
15
+
16
+ self.norm1 = Normalize(in_channels)
17
+ self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size = 3, stride = 1, padding = 1)
18
+ self.norm2 = Normalize(out_channels)
19
+ self.dropout = torch.nn.Dropout(dropout)
20
+ self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1)
21
+
22
+ if self.in_channels != self.out_channels:
23
+ if self.use_conv_shortcut:
24
+ self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
25
+ else:
26
+ self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
27
+
28
+ def forward(self, x):
29
+ h = x
30
+ h = self.norm1(h)
31
+ h = nonlinearity(h)
32
+ h = self.conv1(h)
33
+ h = self.norm2(h)
34
+ h = nonlinearity(h)
35
+ h = self.dropout(h)
36
+ h = self.conv2(h)
37
+
38
+ if self.in_channels != self.out_channels:
39
+ if self.use_conv_shortcut:
40
+ x = self.conv_shortcut(x)
41
+ else:
42
+ x = self.nin_shortcut(x)
43
+
44
+ return x+h
45
+
46
+
auto_encoder/components/sampling.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class Upsample(nn.Module):
5
+ def __init__(self, in_channels : int, with_conv : bool):
6
+ super().__init__()
7
+ self.with_conv = with_conv
8
+ if self.with_conv:
9
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size = 3, stride = 1, padding = 1)
10
+
11
+ def forward(self, x):
12
+ x = torch.nn.functional.interpolate(x, scale_factor = 2.0, mode = "nearest")
13
+ if self.with_conv:
14
+ x = self.conv(x)
15
+ return x
16
+
17
+ class Downsample(nn.Module):
18
+ def __init__(self, in_channels : int, with_conv : bool):
19
+ super().__init__()
20
+ self.with_conv = with_conv
21
+ if self.with_conv:
22
+ self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size = 3, stride = 2, padding = 0)
23
+
24
+ def forward(self, x):
25
+ if self.with_conv:
26
+ pad = (0, 1, 0, 1)
27
+ x = torch.nn.functional.pad(x, pad, mode = "constant", value = 0)
28
+ x = self.conv(x)
29
+ else:
30
+ x = torch.nn.functional.avg_pool2d(x, kernel_size = 2, stride = 2)
31
+ return x
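
As a quick sanity check (illustrative only, not part of the commit), both modules change the spatial size by a factor of 2: Upsample via nearest-neighbor interpolation followed by an optional 3x3 convolution, Downsample via an asymmetrically padded stride-2 convolution (or average pooling when with_conv=False).

import torch
from auto_encoder.components.sampling import Upsample, Downsample

x = torch.randn(1, 64, 32, 32)                    # illustrative input
print(Upsample(64, with_conv=True)(x).shape)      # torch.Size([1, 64, 64, 64])
print(Downsample(64, with_conv=True)(x).shape)    # torch.Size([1, 64, 16, 16])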
auto_encoder/models/auto_encoder.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch.nn as nn
2
+ import torch.nn.functional as F
3
+ from auto_encoder.models.decoder import Decoder
4
+ from auto_encoder.models.encoder import Encoder
5
+ import yaml
6
+
7
+ class AutoEncoder(nn.Module):
8
+ def __init__(self, config_path : str):
9
+ super().__init__()
10
+ with open(config_path, "r") as file:
11
+ config = yaml.safe_load(file)
12
+ self.add_module('encoder', Encoder(**config["encoder"]))
13
+ self.add_module('decoder', Decoder(**config["decoder"]))
14
+
15
+ def encode(self, x):
16
+ h = self.encoder(x)
17
+ return h
18
+
19
+ def decode(self, z):
20
+ z = self.decoder(z)
21
+ return z
22
+
23
+ def reconstruct(self, x):
24
+ return self.decode(self.encode(x))
25
+
26
+ def loss(self, x):
27
+ x_hat = self(x)
28
+ return F.mse_loss(x, x_hat)
29
+
30
+ def forward(self, x):
31
+ return self.reconstruct(x)
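
A minimal end-to-end sketch (not part of the commit), assuming it is run from the repository root so that configs/composite_config.yaml from this upload is on disk; the encoder/decoder keyword arguments come straight from that file.

import torch
from auto_encoder.models.auto_encoder import AutoEncoder

model = AutoEncoder("configs/composite_config.yaml")
x = torch.randn(1, 3, 256, 256)          # resolution must match the config (256)
x_hat = model(x)                         # encode -> decode
print(x_hat.shape)                       # torch.Size([1, 3, 256, 256])
print(model.loss(x).item())              # plain MSE reconstruction loss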
auto_encoder/models/decoder.py ADDED
@@ -0,0 +1,78 @@
1
+ #source : https://github.com/CompVis/latent-diffusion/blob/main/ldm/modules/diffusionmodules/model.py#L368
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from auto_encoder.components.normalize import Normalize
6
+ from auto_encoder.components.resnet_block import ResnetBlock
7
+ from auto_encoder.components.sampling import Upsample
8
+ from auto_encoder.components.nonlinearity import nonlinearity
9
+
10
+ class Decoder(nn.Module):
11
+ def __init__(self, *, in_channels, out_channels, resolution, channels, channel_multipliers = (1, 2, 4, 8), z_channels, num_res_blocks,
12
+ dropout = 0.0, resample_with_conv : bool = True):
13
+ super().__init__()
14
+ self.ch = channels
15
+ self.num_resolutions = len(channel_multipliers)
16
+ self.num_res_blocks = num_res_blocks
17
+ self.in_channels = in_channels
18
+ self.z_channels = z_channels
19
+
20
+ in_ch_mult = (1 , ) + tuple(channel_multipliers)
21
+ block_in = self.ch * in_ch_mult[self.num_resolutions - 1]
22
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
23
+ self.z_shape = (1 , z_channels, curr_res, curr_res)
24
+ print("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
25
+
26
+ # z to block_in
27
+ self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size = 3, stride = 1, padding = 1)
28
+
29
+ # middle
30
+ self.mid = nn.Module()
31
+ self.mid.block_1 = ResnetBlock(in_channels = block_in, out_channels = block_in, dropout = dropout)
32
+ self.mid.block_2 = ResnetBlock(in_channels = block_in, out_channels = block_in, dropout = dropout)
33
+
34
+ # upsampling
35
+
36
+ self.up = nn.ModuleList()
37
+ for i_level in reversed(range(self.num_resolutions)):
38
+ block = nn.ModuleList()
39
+ block_out = self.ch * channel_multipliers[i_level]
40
+ for i_block in range(self.num_res_blocks + 1):
41
+ block.append(ResnetBlock(in_channels = block_in, out_channels = block_out,
42
+ dropout = dropout))
43
+ block_in = block_out
44
+ up = nn.Module()
45
+ up.block = block
46
+ if i_level != 0:
47
+ up.upsample = Upsample(block_in, resample_with_conv)
48
+ curr_res = curr_res * 2
49
+ self.up.insert(0, up)
50
+
51
+ # end
52
+ self.norm_out = Normalize(block_in)
53
+ self.conv_out = torch.nn.Conv2d(block_in, out_channels,
54
+ kernel_size = 3, stride = 1, padding = 1)
55
+
56
+ def forward(self, z):
57
+ assert z.shape[1:] == self.z_shape[1:]
58
+ self.last_z_shape = z.shape
59
+
60
+ # z to block_in
61
+ h = self.conv_in(z)
62
+
63
+ # middle
64
+ h = self.mid.block_1(h)
65
+ h = self.mid.block_2(h)
66
+
67
+ # upsampling
68
+ for i_level in reversed(range(self.num_resolutions)):
69
+ for i_block in range(self.num_res_blocks + 1):
70
+ h = self.up[i_level].block[i_block](h)
71
+ if i_level != 0:
72
+ h = self.up[i_level].upsample(h)
73
+
74
+ # end
75
+ h = self.norm_out(h)
76
+ h = nonlinearity(h)
77
+ h = self.conv_out(h)
78
+ return h
auto_encoder/models/encoder.py ADDED
@@ -0,0 +1,71 @@
1
+ #source : https://github.com/CompVis/latent-diffusion/blob/main/ldm/modules/diffusionmodules/model.py#L368
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from auto_encoder.components.normalize import Normalize
6
+ from auto_encoder.components.resnet_block import ResnetBlock
7
+ from auto_encoder.components.sampling import Downsample
8
+ from auto_encoder.components.nonlinearity import nonlinearity
9
+
10
+ class Encoder(nn.Module):
11
+ def __init__(self, *, in_channels, resolution, channels, channel_multipliers = (1, 2, 4, 8), z_channels, num_res_blocks,
12
+ dropout = 0.0, resample_with_conv : bool = True, double_z : bool = True):
13
+ super().__init__()
14
+ self.ch = channels
15
+ self.num_resolutions = len(channel_multipliers)
16
+ self.num_res_blocks = num_res_blocks
17
+ self.in_channels = in_channels
18
+ self.z_channels = z_channels
19
+
20
+ # downsampling
21
+ self.conv_in = torch.nn.Conv2d(in_channels, self.ch, kernel_size = 3, stride = 1, padding = 1)
22
+ curr_res = resolution
23
+ in_ch_mult = (1, ) + tuple(channel_multipliers)
24
+ self.in_ch_mult = in_ch_mult
25
+ self.down = nn.ModuleList()
26
+ for i_level in range(self.num_resolutions):
27
+ block = nn.ModuleList()
28
+ block_in = self.ch * in_ch_mult[i_level]
29
+ block_out = self.ch * channel_multipliers[i_level]
30
+ for i_block in range(self.num_res_blocks):
31
+ block.append(ResnetBlock(in_channels = block_in, out_channels = block_out, dropout = dropout))
32
+ block_in = block_out
33
+ down = nn.Module()
34
+ down.block = block
35
+ if i_level != self.num_resolutions - 1:
36
+ down.downsample = Downsample(block_in, resample_with_conv)
37
+ curr_res = curr_res // 2
38
+ self.down.append(down)
39
+
40
+ # middle
41
+ self.mid = nn.Module()
42
+ self.mid.block_1 = ResnetBlock(in_channels = block_in, out_channels = block_in, dropout = dropout)
43
+ self.mid.block_2 = ResnetBlock(in_channels = block_in, out_channels = block_in, dropout = dropout)
44
+
45
+ # end
46
+ self.norm_out = Normalize(block_in)
47
+ self.conv_out = torch.nn.Conv2d(block_in, 2 * z_channels if double_z else z_channels,
48
+ kernel_size = 3, stride = 1, padding = 1)
49
+
50
+ def forward(self, x):
51
+ # downsampling
52
+ hs = [self.conv_in(x)]
53
+ for i_level in range(self.num_resolutions):
54
+ for i_block in range(self.num_res_blocks):
55
+ h = self.down[i_level].block[i_block](hs[-1])
56
+
57
+ hs.append(h)
58
+ if i_level != self.num_resolutions - 1:
59
+ hs.append(self.down[i_level].downsample(hs[-1]))
60
+
61
+ # middle
62
+ h = hs[-1]
63
+ h = self.mid.block_1(h)
64
+ h = self.mid.block_2(h)
65
+
66
+ # end
67
+ h = self.norm_out(h)
68
+ h = nonlinearity(h)
69
+ h = self.conv_out(h)
70
+ return h
71
+
auto_encoder/models/variational_auto_encoder.py ADDED
@@ -0,0 +1,45 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from auto_encoder.models.encoder import Encoder
6
+ from auto_encoder.models.decoder import Decoder
7
+ import yaml
8
+ from auto_encoder.components.distributions import DiagonalGaussianDistribution
9
+
10
+ class VariationalAutoEncoder(nn.Module):
11
+ def __init__(self, config_path):
12
+ super().__init__()
13
+ with open(config_path, "r") as file:
14
+ config = yaml.safe_load(file)
15
+ self.add_module('encoder', Encoder(**config["encoder"]))
16
+ self.add_module('decoder', Decoder(**config["decoder"]))
17
+ self.embed_dim = config['vae']['embed_dim']
18
+ self.kld_weight = float(config['vae']['kld_weight'])
19
+
20
+ self.quant_conv = torch.nn.Conv2d(self.decoder.z_channels, 2*self.embed_dim, 1)
21
+ self.post_quant_conv = torch.nn.Conv2d(self.embed_dim, self.decoder.z_channels, 1)
22
+
23
+ def encode(self, x):
24
+ h = self.encoder(x)
25
+ moments = self.quant_conv(h)
26
+ posterior = DiagonalGaussianDistribution(moments)
27
+ return posterior
28
+
29
+ def decode(self, z):
30
+ z = self.post_quant_conv(z)
31
+ dec = self.decoder(z)
32
+ return dec
33
+
34
+ def loss(self, x):
35
+ x_hat, posterior = self(x)
36
+ return F.mse_loss(x, x_hat) + self.kld_weight * posterior.kl().mean()
37
+
38
+ def forward(self, input, sample_posterior=True):
39
+ posterior = self.encode(input)
40
+ if sample_posterior:
41
+ z = posterior.sample()
42
+ else:
43
+ z = posterior.mode()
44
+ dec = self.decode(z)
45
+ return dec, posterior
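
A matching sketch for the variational version (illustrative, not part of the commit), again assuming the shipped configs/composite_config.yaml: encode() returns a DiagonalGaussianDistribution, forward() optionally samples it, and loss() combines the reconstruction MSE with the KL term scaled by kld_weight.

import torch
from auto_encoder.models.variational_auto_encoder import VariationalAutoEncoder

vae = VariationalAutoEncoder("configs/composite_config.yaml")
x = torch.randn(2, 3, 256, 256)
x_hat, posterior = vae(x)                      # stochastic pass through the latent
print(x_hat.shape, posterior.mean.shape)       # [2, 3, 256, 256] and [2, 3, 64, 64]
print(vae.loss(x).item())                      # MSE + kld_weight * posterior.kl().mean()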
clip/encoders/image_encoder.py ADDED
@@ -0,0 +1,44 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class ImageEncoder(nn.Module):
5
+ def __init__(self, in_channels: int, resolution: int, patch_size: int,
6
+ number_of_features: int, number_of_heads:int, number_of_transformer_layers: int,
7
+ embed_dim: int):
8
+ super().__init__()
9
+ self.resolution = resolution
10
+ self.embed_dim = embed_dim
11
+ self.conv = nn.Conv2d(in_channels=in_channels, out_channels=number_of_features,
12
+ kernel_size=patch_size, stride=patch_size, bias=False)
13
+ self.number_of_patches = (resolution // patch_size) ** 2
14
+ self.positional_embedding = nn.Parameter(torch.zeros(1, self.number_of_patches + 1, number_of_features))
15
+ self.class_embedding = nn.Parameter(torch.zeros(1, 1, number_of_features))
16
+
17
+ self.ln_pre = nn.LayerNorm(number_of_features)
18
+ self.transformer = nn.TransformerEncoder(
19
+ nn.TransformerEncoderLayer(d_model=number_of_features, nhead=number_of_heads, batch_first=True),
20
+ num_layers=number_of_transformer_layers
21
+ )
22
+
23
+ self.ln_post = nn.LayerNorm(number_of_features)
24
+ self.fc = nn.Linear(number_of_features, embed_dim)
25
+
26
+ # initialize
27
+ nn.init.kaiming_normal_(self.positional_embedding, nonlinearity='relu')
28
+ nn.init.kaiming_normal_(self.class_embedding, nonlinearity='relu')
29
+ nn.init.kaiming_normal_(self.fc.weight, nonlinearity='relu')
30
+
31
+ def forward(self, x: torch.Tensor):
32
+ x = self.conv(x) # [batch_size, number_of_features, grid, grid]
33
+ x = x.flatten(2) # [batch_size, number_of_features, grid ** 2 = number_of_patches]
34
+ x = x.transpose(1, 2) # [batch_size, number_of_patches, number_of_features]
35
+
36
+ class_embeddings = self.class_embedding.expand(x.shape[0], -1, -1)
37
+ x = torch.cat([class_embeddings, x], dim=1)
38
+ x = x + self.positional_embedding
39
+ x = self.ln_pre(x)
40
+ x = self.transformer(x) # [batch_size, length_of_sequence, number_of_features]
41
+ x = x.permute(1, 0, 2) # [length_of_sequence, batch_size, number_of_features]
42
+ x = self.ln_post(x[0])
43
+ x = self.fc(x) # [batch_size, embed_dim]
44
+ return x
clip/encoders/text_encoder.py ADDED
@@ -0,0 +1,29 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class TextEncoder(nn.Module):
5
+ def __init__(self, number_of_features: int, number_of_heads: int, number_of_transformer_layers: int,
6
+ context_length, embed_dim):
7
+ super().__init__()
8
+ self.vocab_size = 32000 # AutoTokenizer: "koclip/koclip-base-pt"
9
+ self.token_embedding = nn.Embedding(self.vocab_size, number_of_features)
10
+ self.positional_embedding = nn.Parameter(torch.zeros(context_length, number_of_features))
11
+ self.transformer = nn.TransformerEncoder(
12
+ nn.TransformerEncoderLayer(d_model=number_of_features, nhead=number_of_heads, batch_first=True),
13
+ num_layers=number_of_transformer_layers
14
+ )
15
+ self.text_projection = nn.Linear(number_of_features, embed_dim)
16
+
17
+ # initialize
18
+ nn.init.normal_(self.token_embedding.weight, std=0.02)
19
+ nn.init.xavier_uniform_(self.positional_embedding)
20
+ nn.init.kaiming_normal_(self.text_projection.weight, nonlinearity='relu')
21
+
22
+ def forward(self, x):
23
+ eot_token_idx = (x == 2).nonzero(as_tuple=True)[1] # Assume EOT token ID is 2
24
+ x = self.token_embedding(x)
25
+ x = x + self.positional_embedding[:x.size(1), :]
26
+ x = self.transformer(x)
27
+ x = x[torch.arange(x.shape[0]), eot_token_idx]
28
+ x = self.text_projection(x)
29
+ return x
clip/models/clip.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import yaml
5
+ import numpy as np
6
+
7
+ from clip.encoders.image_encoder import ImageEncoder
8
+ from clip.encoders.text_encoder import TextEncoder
9
+ from helper.tokenizer import Tokenizer
10
+
11
+ class CLIP(nn.Module):
12
+ def __init__(self, config_path):
13
+ super().__init__()
14
+ with open(config_path, "r") as file:
15
+ config = yaml.safe_load(file)
16
+
17
+ self.image_encoder = ImageEncoder(**config["image_encoder"])
18
+ self.text_encoder = TextEncoder(**config["text_encoder"])
19
+ self.tokenizer = Tokenizer()
20
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
21
+
22
+ # initialize
23
+ for module in self.modules():
24
+ if isinstance(module, nn.Linear):
25
+ nn.init.xavier_normal_(module.weight)
26
+ if module.bias is not None:
27
+ nn.init.constant_(module.bias, 0)
28
+ elif isinstance(module, nn.Conv2d):
29
+ nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
30
+ elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
31
+ nn.init.constant_(module.weight, 1)
32
+ nn.init.constant_(module.bias, 0)
33
+
34
+ def loss(self, image, text):
35
+ image_features, text_features = self(image, text, tokenize=False)
36
+
37
+ # Normalize features
38
+ image_features = F.normalize(image_features, dim=1)
39
+ text_features = F.normalize(text_features, dim=1)
40
+
41
+ # Cosine similarity as logits with learned temperature
42
+ logits = torch.matmul(image_features, text_features.t()) * self.logit_scale.exp()
43
+ labels = torch.arange(logits.shape[0], dtype=torch.long, device=logits.device)
44
+
45
+ # Cross-entropy loss
46
+ loss_i2t = F.cross_entropy(logits, labels)
47
+ loss_t2i = F.cross_entropy(logits.t(), labels)
48
+
49
+ return (loss_i2t + loss_t2i) / 2
50
+
51
+ def text_encode(self, text, tokenize=True):
52
+ if tokenize:
53
+ tokens = self.tokenizer.tokenize(text)
54
+ else:
55
+ tokens = text
56
+ text_features = self.text_encoder(tokens)
57
+ if text_features.dim() < 2:
58
+ text_features = text_features.unsqueeze(0)
59
+ return text_features
60
+
61
+ def forward(self, image, text, tokenize=True):
62
+ image_features = self.image_encoder(image)
63
+ text_features = self.text_encode(text, tokenize)  # TextEncoder.forward() only accepts token ids, so route through text_encode()
64
+
65
+ if image_features.dim() < 2:
66
+ image_features = image_features.unsqueeze(0)
67
+ if text_features.dim() < 2:
68
+ text_features = text_features.unsqueeze(0)
69
+
70
+ return image_features, text_features
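
A small smoke-test sketch (illustrative, not part of the commit), using the configs/composite_clip_config.yaml from this upload. Tokenization is skipped here by passing pre-built token ids with an EOT id of 2 per row, matching the assumption in TextEncoder; note that constructing CLIP also instantiates the helper Tokenizer, which may download its underlying pretrained tokenizer.

import torch
from clip.models.clip import CLIP

model = CLIP("configs/composite_clip_config.yaml")
images = torch.randn(4, 3, 256, 256)                 # resolution / in_channels from the config
tokens = torch.randint(3, 32000, (4, 77))            # context_length 77, vocab size 32000
tokens[:, -1] = 2                                    # one EOT token (id 2) per sequence
image_f, text_f = model(images, tokens, tokenize=False)
print(image_f.shape, text_f.shape)                   # [4, 128] and [4, 128] (embed_dim)
print(model.loss(images, tokens).item())             # symmetric image/text cross-entropy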
clip/models/ko_clip.py ADDED
@@ -0,0 +1,26 @@
1
+ import torch.nn as nn
2
+
3
+ from transformers import AutoModel, AutoTokenizer
4
+
5
+ class KoCLIPWrapper(nn.Module):
6
+ def __init__(self):
7
+ super().__init__()
8
+ self.model_name = "Bingsu/clip-vit-base-patch32-ko"
9
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
10
+ self.model = AutoModel.from_pretrained(self.model_name)
11
+
12
+ def loss(self, inputs):
13
+ outputs = self(inputs)
14
+ return outputs.loss
15
+
16
+ def text_encode(self, text, tokenize):
17
+ if tokenize:
18
+ tokens = self.tokenizer(text, padding='max_length', max_length=77, truncation=True, return_tensors="pt")
19
+ else:
20
+ tokens = text
21
+ tokens = tokens.to(self.model.device)
22
+ return self.model.get_text_features(**tokens)
23
+
24
+ def forward(self, inputs):
25
+ outputs = self.model(**inputs, return_loss=True)
26
+ return outputs # [1, 512] , [1, 512]
configs/composite_clip_config.yaml ADDED
@@ -0,0 +1,15 @@
1
+ text_encoder:
2
+ number_of_features: 512
3
+ number_of_heads: 8
4
+ number_of_transformer_layers: 6
5
+ context_length: 77
6
+ embed_dim: 128
7
+
8
+ image_encoder:
9
+ in_channels: 3
10
+ resolution: 256
11
+ patch_size: 16
12
+ number_of_features: 768
13
+ number_of_heads: 12
14
+ number_of_transformer_layers: 4
15
+ embed_dim: 128
configs/composite_config.yaml ADDED
@@ -0,0 +1,38 @@
1
+ encoder:
2
+ in_channels: 3
3
+ resolution: 256
4
+ channels: 128
5
+ channel_multipliers: [1, 2, 4]
6
+ z_channels: 3
7
+ num_res_blocks: 2
8
+ dropout: 0.0
9
+
10
+ decoder:
11
+ in_channels: 3
12
+ out_channels: 3
13
+ resolution: 256
14
+ channels: 128
15
+ channel_multipliers: [1, 2, 4]
16
+ z_channels: 6
17
+ num_res_blocks: 2
18
+ dropout: 0.0
19
+
20
+ vae:
21
+ embed_dim: 3
22
+ kld_weight: 1e-6
23
+
24
+ sampler:
25
+ beta: 'sigmoid'
26
+ T: 1000
27
+ sampling_T: 50
28
+ eta: 1
29
+
30
+ cond_encoder:
31
+ embed_dim: 512
32
+ cond_dim: 768
33
+ cond_drop_prob: 0.2
34
+
35
+ unet:
36
+ dim: 192
37
+ dim_mults: [1, 2, 4, 8]
38
+ cond_dim: 768
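
Each top-level section of this file maps directly onto a constructor: the models simply splat the corresponding dictionary into keyword arguments, so the keys must match the parameter names exactly. A minimal loading sketch (illustrative):

import yaml
from auto_encoder.models.encoder import Encoder

with open("configs/composite_config.yaml") as f:
    config = yaml.safe_load(f)
encoder = Encoder(**config["encoder"])    # keys map 1:1 to Encoder.__init__ keyword arguments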
diffusion_model/models/clip_latent_diffusion_model.py ADDED
@@ -0,0 +1,40 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from auto_encoder.models.variational_auto_encoder import VariationalAutoEncoder
5
+ from diffusion_model.models.latent_diffusion_model import LatentDiffusionModel
6
+ from clip.models.clip import CLIP
7
+
8
+ class CLIPLatentDiffusionModel(LatentDiffusionModel) :
9
+ def __init__(self, network : nn.Module, sampler : nn.Module,
10
+ auto_encoder : VariationalAutoEncoder, clip : CLIP, image_shape):
11
+ super().__init__(network, sampler, auto_encoder)  # LatentDiffusionModel takes no image_shape; it derives the latent shape from the auto-encoder
12
+ self.clip = clip
13
+ self.clip.eval()
14
+ for param in self.clip.parameters():
15
+ param.requires_grad = False
16
+
17
+ def loss(self, x0, text):
18
+ text = self.clip.text_encode(text, tokenize=False)
19
+ x0 = self.auto_encoder.encode(x0).sample()
20
+ eps = torch.randn_like(x0)
21
+ t = torch.randint(0, self.T, (x0.size(0),), device = x0.device)
22
+ x_t = self.sampler.q_sample(x0, t, eps)
23
+ eps_hat = self.network(x=x_t, t=t, y=text)
24
+ return self.weighted_loss(t, eps, eps_hat)
25
+
26
+ @torch.no_grad()
27
+ def forward(self, text, n_samples : int = 4):
28
+ text = self.clip.text_encode(text)
29
+ text = text.repeat(n_samples, 1)
30
+ x_T = torch.randn(n_samples, *self.image_shape, device = next(self.buffers(), None).device)  # image_shape holds the latent shape set by LatentDiffusionModel
31
+ sample = self.sampler(x_T = x_T, y=text)
32
+ return self.auto_encoder.decode(sample)
33
+
34
+ @torch.no_grad()
35
+ def generate_sequence(self, text, n_samples : int = 4):
36
+ text = self.clip.text_encode(text)
37
+ text = text.repeat(n_samples, 1)
38
+ x_T = torch.randn(n_samples, *self.image_shape, device = next(self.buffers(), None).device)
39
+ sample_sequence = self.sampler.reverse_process(x_T, y = text, only_last=False)
40
+ return sample_sequence
diffusion_model/models/diffusion_model.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from einops import reduce
4
+
5
+ from helper.util import extract
6
+
7
+ class DiffusionModel(nn.Module) :
8
+ def __init__(self, network : nn.Module, sampler : nn.Module, image_shape):
9
+ super().__init__()
10
+ self.add_module('sampler', sampler)
11
+ self.add_module('network', network)
12
+ self.sampler.set_network(network)
13
+ self.T = sampler.T
14
+ self.image_shape = image_shape
15
+
16
+ # loss weight
17
+ alpha_bar = self.sampler.alpha_bar
18
+ snr = alpha_bar / (1 - alpha_bar)
19
+ clipped_snr = snr.clone()
20
+ clipped_snr.clamp_(max = 5)
21
+ self.register_buffer('loss_weight', clipped_snr / snr)
22
+
23
+ def weighted_loss(self, t, eps, eps_hat):
24
+ loss = nn.functional.mse_loss(eps, eps_hat, reduction='none')
25
+ loss = reduce(loss, 'b ... -> b', 'mean')
26
+ loss = loss * extract(self.loss_weight, t, loss.shape)
27
+ return loss.mean()
28
+
29
+ def loss(self, x0, **kwargs):
30
+ eps = torch.randn_like(x0)
31
+ t = torch.randint(0, self.T, (x0.size(0),), device = x0.device)
32
+ x_t = self.sampler.q_sample(x0, t, eps)
33
+ eps_hat = self.network(x = x_t, t = t, **kwargs)
34
+ return self.weighted_loss(t, eps, eps_hat)
35
+
36
+ @torch.no_grad()
37
+ def forward(self, n_samples: int = 4, only_last: bool = True, gamma = None, **kwargs):
38
+ """
39
+ If only_last is False, the output is the full sequence of generated samples rather than only the final one
40
+ """
41
+ x_T = torch.randn(n_samples, *self.image_shape, device = next(self.buffers(), None).device)
42
+ return self.sampler(x_T = x_T, only_last=only_last, gamma = gamma, **kwargs)
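
A minimal training-side wiring sketch (illustrative, not part of the commit): the sampler provides the noise schedule and q_sample, the network predicts the noise, and weighted_loss applies the clipped-SNR weighting. The dimensions below are made up for a small unconditional model; sampling with a bare Unet would additionally need scalar timesteps broadcast to a batch, which is what UnetWrapper (later in this commit) handles.

import torch
from diffusion_model.network.unet import Unet
from diffusion_model.models.diffusion_model import DiffusionModel
from diffusion_model.sampler.ddpm import DDPM

sampler = DDPM("configs/composite_config.yaml")            # reads the sampler: section (beta schedule, T)
network = Unet(dim=64, dim_mults=(1, 2, 4), channels=3)    # unconditional: cond_dim left unset
model = DiffusionModel(network, sampler, image_shape=(3, 32, 32))

x0 = torch.randn(8, 3, 32, 32)                             # stand-in for a normalized image batch
print(model.loss(x0).item())                               # SNR-weighted noise-prediction loss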
diffusion_model/models/latent_diffusion_model.py ADDED
@@ -0,0 +1,37 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from auto_encoder.models.variational_auto_encoder import VariationalAutoEncoder
5
+ from diffusion_model.models.diffusion_model import DiffusionModel
6
+
7
+ class LatentDiffusionModel(DiffusionModel) :
8
+ def __init__(self, network : nn.Module, sampler : nn.Module, auto_encoder : VariationalAutoEncoder):
9
+ super().__init__(network, sampler, None)
10
+ self.auto_encoder = auto_encoder
11
+ self.auto_encoder.eval()
12
+ for param in self.auto_encoder.parameters():
13
+ param.requires_grad = False
14
+ # The image shape is the latent shape
15
+ self.image_shape = [*self.auto_encoder.decoder.z_shape[1:]]
16
+ self.image_shape[0] = self.auto_encoder.embed_dim
17
+
18
+ def loss(self, x0, **kwargs):
19
+ x0 = self.auto_encoder.encode(x0).sample()
20
+ eps = torch.randn_like(x0)
21
+ t = torch.randint(0, self.T, (x0.size(0),), device = x0.device)
22
+ x_t = self.sampler.q_sample(x0, t, eps)
23
+ eps_hat = self.network(x = x_t, t = t, **kwargs)
24
+ return self.weighted_loss(t, eps, eps_hat)
25
+
26
+ # The forward function outputs the generated latents
27
+ # Therefore, sample() should be used for sampling data, not latents
28
+ @torch.no_grad()
29
+ def sample(self, n_samples: int = 4, gamma = None, **kwargs):
30
+ sample = self(n_samples, gamma=gamma, **kwargs)
31
+ return self.auto_encoder.decode(sample)
32
+
33
+ @torch.no_grad()
34
+ def generate_sequence(self, n_samples: int = 4, gamma = None, **kwargs):
35
+ sequence = self(n_samples, only_last=False, gamma = gamma, **kwargs)
36
+ sample = self.auto_encoder.decode(sequence[-1])
37
+ return sequence, sample
diffusion_model/network/attention.py ADDED
@@ -0,0 +1,187 @@
1
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/attend.py
2
+ from functools import wraps
3
+ from packaging import version
4
+ from collections import namedtuple
5
+
6
+ import torch
7
+ from torch import nn, einsum
8
+ import torch.nn.functional as F
9
+
10
+ from einops import rearrange, repeat
11
+ from functools import partial
12
+
13
+ # constants
14
+
15
+ AttentionConfig = namedtuple('AttentionConfig', ['enable_flash', 'enable_math', 'enable_mem_efficient'])
16
+
17
+ # helpers
18
+
19
+ def exists(val):
20
+ return val is not None
21
+
22
+ def default(val, d):
23
+ return val if exists(val) else d
24
+
25
+ def once(fn):
26
+ called = False
27
+ @wraps(fn)
28
+ def inner(x):
29
+ nonlocal called
30
+ if called:
31
+ return
32
+ called = True
33
+ return fn(x)
34
+ return inner
35
+
36
+ print_once = once(print)
37
+
38
+ class RMSNorm(nn.Module):
39
+ def __init__(self, dim):
40
+ super().__init__()
41
+ self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
42
+
43
+ def forward(self, x):
44
+ return F.normalize(x, dim = 1) * self.g * (x.shape[1] ** 0.5)
45
+
46
+ # main class
47
+
48
+ class Attend(nn.Module):
49
+ def __init__(
50
+ self,
51
+ dropout = 0.,
52
+ flash = False,
53
+ scale = None
54
+ ):
55
+ super().__init__()
56
+ self.dropout = dropout
57
+ self.scale = scale
58
+ self.attn_dropout = nn.Dropout(dropout)
59
+
60
+ self.flash = flash
61
+ assert not (flash and version.parse(torch.__version__) < version.parse('2.0.0')), 'in order to use flash attention, you must be using pytorch 2.0 or above'
62
+
63
+ # determine efficient attention configs for cuda and cpu
64
+
65
+ self.cpu_config = AttentionConfig(True, True, True)
66
+ self.cuda_config = None
67
+
68
+ if not torch.cuda.is_available() or not flash:
69
+ return
70
+
71
+ device_properties = torch.cuda.get_device_properties(torch.device('cuda'))
72
+
73
+ device_version = version.parse(f'{device_properties.major}.{device_properties.minor}')
74
+
75
+ if device_version > version.parse('8.0'):
76
+ print_once('A100 GPU detected, using flash attention if input tensor is on cuda')
77
+ self.cuda_config = AttentionConfig(True, False, False)
78
+ else:
79
+ print_once('Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda')
80
+ self.cuda_config = AttentionConfig(False, True, True)
81
+
82
+ def flash_attn(self, q, k, v):
83
+ _, heads, q_len, _, k_len, is_cuda, device = *q.shape, k.shape[-2], q.is_cuda, q.device
84
+
85
+ if exists(self.scale):
86
+ default_scale = q.shape[-1] ** -0.5  # SDPA's default softmax scale is 1 / sqrt(head_dim)
87
+ q = q * (self.scale / default_scale)
88
+
89
+ q, k, v = map(lambda t: t.contiguous(), (q, k, v))
90
+
91
+ # Check if there is a compatible device for flash attention
92
+
93
+ config = self.cuda_config if is_cuda else self.cpu_config
94
+
95
+ # pytorch 2.0 flash attn: q, k, v, mask, dropout, causal, softmax_scale
96
+
97
+ with torch.backends.cuda.sdp_kernel(**config._asdict()):
98
+ out = F.scaled_dot_product_attention(
99
+ q, k, v,
100
+ dropout_p = self.dropout if self.training else 0.
101
+ )
102
+
103
+ return out
104
+
105
+ def forward(self, q, k, v):
106
+ """
107
+ einstein notation
108
+ b - batch
109
+ h - heads
110
+ n, i, j - sequence length (base sequence length, source, target)
111
+ d - feature dimension
112
+ """
113
+
114
+ q_len, k_len, device = q.shape[-2], k.shape[-2], q.device
115
+
116
+ if self.flash:
117
+ return self.flash_attn(q, k, v)
118
+
119
+ scale = default(self.scale, q.shape[-1] ** -0.5)
120
+
121
+ # similarity
122
+
123
+ sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale
124
+
125
+ # attention
126
+
127
+ attn = sim.softmax(dim = -1)
128
+ attn = self.attn_dropout(attn)
129
+
130
+ # aggregate values
131
+
132
+ out = einsum(f"b h i j, b h j d -> b h i d", attn, v)
133
+
134
+ return out
135
+
136
+ class LinearAttention(nn.Module):
137
+ def __init__(self, dim, heads = 4, dim_head = 32):
138
+ super().__init__()
139
+ self.scale = dim_head ** -0.5
140
+ self.heads = heads
141
+ hidden_dim = dim_head * heads
142
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
143
+
144
+ self.to_out = nn.Sequential(
145
+ nn.Conv2d(hidden_dim, dim, 1),
146
+ RMSNorm(dim)
147
+ )
148
+
149
+ def forward(self, x):
150
+ b, c, h, w = x.shape
151
+ qkv = self.to_qkv(x).chunk(3, dim = 1)
152
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)
153
+
154
+ q = q.softmax(dim = -2)
155
+ k = k.softmax(dim = -1)
156
+
157
+ q = q * self.scale
158
+
159
+ context = torch.einsum('b h d n, b h e n -> b h d e', k, v)
160
+
161
+ out = torch.einsum('b h d e, b h d n -> b h e n', context, q)
162
+ out = rearrange(out, 'b h c (x y) -> b (h c) x y', h = self.heads, x = h, y = w)
163
+ return self.to_out(out)
164
+
165
+ class Attention(nn.Module):
166
+ def __init__(self, dim, heads = 4, dim_head = 32):
167
+ super().__init__()
168
+ self.scale = dim_head ** -0.5
169
+ self.heads = heads
170
+ hidden_dim = dim_head * heads
171
+
172
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
173
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
174
+
175
+ def forward(self, x):
176
+ b, c, h, w = x.shape
177
+ qkv = self.to_qkv(x).chunk(3, dim = 1)
178
+ q, k, v = map(lambda t: rearrange(t, 'b (h c) x y -> b h c (x y)', h = self.heads), qkv)
179
+
180
+ q = q * self.scale
181
+
182
+ sim = einsum('b h d i, b h d j -> b h i j', q, k)
183
+ attn = sim.softmax(dim = -1)
184
+ out = einsum('b h i j, b h d j -> b h i d', attn, v)
185
+
186
+ out = rearrange(out, 'b h (x y) d -> b (h d) x y', x = h, y = w)
187
+ return self.to_out(out)
diffusion_model/network/blocks.py ADDED
@@ -0,0 +1,89 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ class AdaptiveGroupNorm(nn.Module):
5
+ def __init__(self, num_groups, num_channels, emb_dim, eps=1e-5):
6
+ super().__init__()
7
+ self.num_groups = num_groups
8
+ self.num_channels = num_channels
9
+ self.eps = eps
10
+ # Use a standard GroupNorm, but without learnable affine parameters
11
+ self.norm = nn.GroupNorm(num_groups, num_channels, eps=eps, affine=False)
12
+
13
+ # Linear layers to project the embedding to gamma and beta
14
+ self.gamma_proj = nn.Linear(emb_dim, num_channels)
15
+ self.beta_proj = nn.Linear(emb_dim, num_channels)
16
+
17
+ def forward(self, x, emb):
18
+ """
19
+ Args:
20
+ x: Input tensor of shape [B, C, H, W].
21
+ emb: Embedding tensor of shape [B, emb_dim].
22
+
23
+ Returns:
24
+ Normalized tensor with adaptive scaling and shifting.
25
+ """
26
+ # Normalize as usual with GroupNorm
27
+ normalized = self.norm(x)
28
+
29
+ # Get gamma and beta from the embedding
30
+ gamma = self.gamma_proj(emb)
31
+ beta = self.beta_proj(emb)
32
+
33
+ # Reshape for broadcasting: [B, C] -> [B, C, 1, 1]
34
+ gamma = gamma.view(-1, self.num_channels, 1, 1)
35
+ beta = beta.view(-1, self.num_channels, 1, 1)
36
+
37
+ # Apply adaptive scaling and shifting
38
+ return gamma * normalized + beta
39
+
40
+ class DepthwiseSeparableConv2d(nn.Module):
41
+ def __init__(self, dim_in, dim_out, kernel_size, padding):
42
+ super().__init__()
43
+ self.depthwise = nn.Conv2d(dim_in, dim_in, kernel_size, padding=padding, groups=dim_in)
44
+ self.pointwise = nn.Conv2d(dim_in, dim_out, 1) # 1x1 convolution
45
+
46
+ def forward(self, x):
47
+ x = self.depthwise(x)
48
+ x = self.pointwise(x)
49
+ return x
50
+
51
+ class Block(nn.Module):
52
+ def __init__(self, dim, dim_out, groups, emb_dim, dropout=0.0, use_depthwise=False):
53
+ super().__init__()
54
+ self.norm = AdaptiveGroupNorm(groups, dim, emb_dim)
55
+ if use_depthwise:
56
+ self.proj = DepthwiseSeparableConv2d(dim, dim_out, kernel_size=3, padding=1)
57
+ else:
58
+ self.proj = nn.Conv2d(dim, dim_out, 3, padding=1)
59
+ self.act = nn.SiLU()
60
+ self.dropout = nn.Dropout(dropout)
61
+
62
+ def forward(self, x, emb):
63
+ x = self.norm(x, emb) # Pre-normalization
64
+ x = self.proj(x)
65
+ x = self.act(x)
66
+ return self.dropout(x)
67
+
68
+ class ResnetBlock(nn.Module):
69
+ def __init__(self, dim: int, dim_out: int, t_emb_dim: int, *,
70
+ y_emb_dim: int = None, groups: int = 32, dropout: float = 0.0, residual_scale=1.0):
71
+ super().__init__()
72
+ if y_emb_dim is None:
73
+ y_emb_dim = 0
74
+ emb_dim = t_emb_dim + y_emb_dim
75
+
76
+ self.block1 = Block(dim, dim_out, groups, emb_dim, dropout) # Pass emb_dim
77
+ self.block2 = Block(dim_out, dim_out, groups, emb_dim, dropout) # Pass emb_dim
78
+ self.res_conv = nn.Conv2d(dim, dim_out, 1) if dim != dim_out else nn.Identity()
79
+ self.residual_scale = nn.Parameter(torch.tensor(residual_scale))
80
+
81
+ def forward(self, x, t_emb, y_emb=None):
82
+ cond_emb = t_emb
83
+ if y_emb is not None:
84
+ cond_emb = torch.cat([cond_emb, y_emb], dim=-1)
85
+
86
+ h = self.block1(x, cond_emb) # Pass combined embedding to Block
87
+ h = self.block2(h, cond_emb) # Pass combined embedding to Block
88
+
89
+ return self.residual_scale * h + self.res_conv(x) # Scale the residual
diffusion_model/network/timestep_embedding.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
+ class SinusoidalEmbedding(nn.Module):
6
+ def __init__(self, embed_dim : int, theta : int = 10000):
7
+ """
8
+ Creates sinusoidal embeddings for timesteps.
9
+
10
+ Args:
11
+ embed_dim: The dimensionality of the embedding.
12
+ theta: The base for the log-spaced frequencies.
13
+ """
14
+ super().__init__()
15
+ self.embed_dim = embed_dim
16
+ self.theta = theta
17
+
18
+ def forward(self, x):
19
+ """
20
+ Computes sinusoidal embeddings for the input timesteps.
21
+
22
+ Args:
23
+ x: A 1D torch.Tensor of timesteps (shape: [batch_size]).
24
+
25
+ Returns:
26
+ A torch.Tensor of sinusoidal embeddings (shape: [batch_size, embed_dim]).
27
+ """
28
+ assert isinstance(x, torch.Tensor) # Input must be a torch.Tensor
29
+ assert x.ndim == 1 # Input must be a 1D tensor
30
+ assert isinstance(self.embed_dim, int) and self.embed_dim > 0 # embed_dim must be a positive integer
31
+
32
+ half_dim = self.embed_dim // 2
33
+ # Create a sequence of log-spaced frequencies
34
+ embeddings = math.log(self.theta) / (half_dim - 1)
35
+ embeddings = torch.exp(torch.arange(half_dim, device=x.device) * -embeddings)
36
+ # Outer product: timesteps x frequencies
37
+ embeddings = x[:, None] * embeddings[None, :]
38
+ embeddings = torch.cat((embeddings.sin(), embeddings.cos()), dim=-1)
39
+ # Handle odd embedding dimensions
40
+ if self.embed_dim % 2 == 1:
41
+ embeddings = torch.cat([embeddings, torch.zeros_like(embeddings[:, :1])], dim=-1)
42
+ return embeddings
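
A quick shape check (illustrative): the module maps a batch of integer timesteps to fixed sinusoidal features of size embed_dim, which the U-Net's time MLP then projects.

import torch
from diffusion_model.network.timestep_embedding import SinusoidalEmbedding

emb = SinusoidalEmbedding(embed_dim=192)
t = torch.randint(0, 1000, (8,))     # one timestep per batch element
print(emb(t).shape)                  # torch.Size([8, 192])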
diffusion_model/network/unet.py ADDED
@@ -0,0 +1,217 @@
1
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
2
+
3
+ from functools import partial
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import Module, ModuleList
7
+ from diffusion_model.network.attention import LinearAttention, Attention
8
+ from diffusion_model.network.timestep_embedding import SinusoidalEmbedding
9
+ from diffusion_model.network.blocks import ResnetBlock
10
+
11
+ def exists(x):
12
+ return x is not None
13
+
14
+ def default(val, d):
15
+ if exists(val):
16
+ return val
17
+ return d() if callable(d) else d
18
+
19
+ def cast_tuple(t, length = 1):
20
+ if isinstance(t, tuple):
21
+ return t
22
+ return ((t,) * length)
23
+
24
+ def divisible_by(numer, denom):
25
+ return (numer % denom) == 0
26
+
27
+ # small helper modules
28
+
29
+ class DownSample(nn.Module):
30
+ def __init__(self, dim: int, dim_out: int):
31
+ """
32
+ Downsamples the spatial dimensions by a factor of 2 using a strided convolution.
33
+
34
+ Args:
35
+ dim: Input channel dimension.
36
+ """
37
+ super().__init__()
38
+ self.downsample = nn.Conv2d(dim, dim_out, kernel_size=4, stride=2, padding=1)
39
+
40
+ def forward(self, x: torch.tensor) -> torch.tensor:
41
+ """
42
+ Forward pass.
43
+
44
+ Args:
45
+ x: Input tensor of shape [B, C, H, W].
46
+
47
+ Returns:
48
+ Downsampled tensor of shape [B, C, H/2, W/2].
49
+ """
50
+ return self.downsample(x)
51
+
52
+ class UpSample(nn.Module):
53
+ def __init__(self, dim: int, dim_out: int):
54
+ """
55
+ Upsamples the spatial dimensions by a factor of 2 using a transposed convolution.
56
+
57
+ Args:
58
+ dim: Input channel dimension.
59
+ """
60
+ super().__init__()
61
+ self.upsample = nn.ConvTranspose2d(dim, dim_out, kernel_size=4, stride=2, padding=1)
62
+
63
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
64
+ """
65
+ Forward pass.
66
+
67
+ Args:
68
+ x: Input tensor of shape [B, C, H, W].
69
+
70
+ Returns:
71
+ Upsampled tensor of shape [B, C, 2*H, 2*W].
72
+ """
73
+ return self.upsample(x)
74
+
75
+
76
+ # model
77
+
78
+ class Unet(Module):
79
+ def __init__(
80
+ self,
81
+ dim,
82
+ init_dim = None,
83
+ out_dim = None,
84
+ cond_dim = None,
85
+ dim_mults = (1, 2, 4, 8),
86
+ channels = 3,
87
+ dropout = 0.,
88
+ attn_dim_head = 32,
89
+ attn_heads = 4,
90
+ full_attn = None, # defaults to full attention only for inner most layer
91
+ ):
92
+ super().__init__()
93
+
94
+ # determine dimensions
95
+
96
+ self.channels = channels
97
+ input_channels = channels
98
+
99
+ init_dim = default(init_dim, dim)
100
+ self.init_conv = nn.Conv2d(input_channels, init_dim, 7, padding = 3)
101
+
102
+ dims = [*map(lambda m: dim * m, dim_mults)]
103
+ in_out = list(zip(dims[:-1], dims[1:]))
104
+
105
+ # time embeddings
106
+ time_dim = dim * 4
107
+
108
+ sinu_pos_emb = SinusoidalEmbedding(dim)
109
+
110
+ self.time_mlp = nn.Sequential(
111
+ sinu_pos_emb,
112
+ nn.Linear(dim, time_dim),
113
+ nn.GELU(),
114
+ nn.Linear(time_dim, time_dim)
115
+ )
116
+
117
+ # attention
118
+
119
+ if not full_attn:
120
+ full_attn = (*((False,) * (len(dim_mults) - 1)), True)
121
+
122
+ num_stages = len(dim_mults)
123
+ full_attn = cast_tuple(full_attn, num_stages)
124
+ attn_heads = cast_tuple(attn_heads, num_stages)
125
+ attn_dim_head = cast_tuple(attn_dim_head, num_stages)
126
+
127
+ assert len(full_attn) == len(dim_mults)
128
+
129
+ # prepare blocks
130
+
131
+ FullAttention = Attention
132
+ resnet_block = partial(ResnetBlock,
133
+ t_emb_dim = time_dim, y_emb_dim = cond_dim, dropout = dropout)
134
+
135
+ # layers
136
+
137
+ self.downs = ModuleList([])
138
+ self.ups = ModuleList([])
139
+ num_resolutions = len(in_out)
140
+
141
+ for ind, ((dim_in, dim_out), layer_full_attn, layer_attn_heads, layer_attn_dim_head) in enumerate(zip(in_out, full_attn, attn_heads, attn_dim_head)):
142
+ is_last = ind >= (num_resolutions - 1)
143
+
144
+ attn_klass = FullAttention if layer_full_attn else LinearAttention
145
+
146
+ self.downs.append(ModuleList([
147
+ resnet_block(dim_in, dim_in),
148
+ resnet_block(dim_in, dim_in),
149
+ attn_klass(dim_in, dim_head = layer_attn_dim_head, heads = layer_attn_heads),
150
+ DownSample(dim_in, dim_out) if not is_last else nn.Conv2d(dim_in, dim_out, 3, padding = 1)
151
+ ]))
152
+
153
+ mid_dim = dims[-1]
154
+ self.mid_block1 = resnet_block(mid_dim, mid_dim)
155
+ self.mid_attn = FullAttention(mid_dim, heads = attn_heads[-1], dim_head = attn_dim_head[-1])
156
+ self.mid_block2 = resnet_block(mid_dim, mid_dim)
157
+
158
+ for ind, ((dim_in, dim_out), layer_full_attn, layer_attn_heads, layer_attn_dim_head) in enumerate(zip(*map(reversed, (in_out, full_attn, attn_heads, attn_dim_head)))):
159
+ is_last = ind == (len(in_out) - 1)
160
+
161
+ attn_klass = FullAttention if layer_full_attn else LinearAttention
162
+
163
+ self.ups.append(ModuleList([
164
+ resnet_block(dim_out + dim_in, dim_out),
165
+ resnet_block(dim_out + dim_in, dim_out),
166
+ attn_klass(dim_out, dim_head = layer_attn_dim_head, heads = layer_attn_heads),
167
+ UpSample(dim_out, dim_in) if not is_last else nn.Conv2d(dim_out, dim_in, 3, padding = 1)
168
+ ]))
169
+
170
+ default_out_dim = channels
171
+ self.out_dim = default(out_dim, default_out_dim)
172
+
173
+ self.final_res_block = resnet_block(init_dim * 2, init_dim)
174
+ self.final_conv = nn.Conv2d(init_dim, self.out_dim, 1)
175
+
176
+ @property
177
+ def downsample_factor(self):
178
+ return 2 ** (len(self.downs) - 1)
179
+
180
+ def forward(self, x, t, y = None):
181
+ assert all([divisible_by(d, self.downsample_factor) for d in x.shape[-2:]]), f'your input dimensions {x.shape[-2:]} need to be divisible by {self.downsample_factor}, given the unet'
182
+
183
+ x = self.init_conv(x)
184
+ r = x.clone()
185
+
186
+ t = self.time_mlp(t)
187
+
188
+ h = []
189
+
190
+ for block1, block2, attn, downsample in self.downs:
191
+ x = block1(x, t, y)
192
+ h.append(x)
193
+
194
+ x = block2(x, t, y)
195
+ x = attn(x) + x
196
+ h.append(x)
197
+
198
+ x = downsample(x)
199
+
200
+ x = self.mid_block1(x, t, y)
201
+ x = self.mid_attn(x) + x
202
+ x = self.mid_block2(x, t, y)
203
+
204
+ for block1, block2, attn, upsample in self.ups:
205
+ x = torch.cat((x, h.pop()), dim = 1)
206
+ x = block1(x, t, y)
207
+
208
+ x = torch.cat((x, h.pop()), dim = 1)
209
+ x = block2(x, t, y)
210
+ x = attn(x) + x
211
+
212
+ x = upsample(x)
213
+
214
+ x = torch.cat((x, r), dim = 1)
215
+
216
+ x = self.final_res_block(x, t, y)
217
+ return self.final_conv(x)
diffusion_model/network/unet_wrapper.py ADDED
@@ -0,0 +1,32 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import yaml
4
+ import transformers
5
+
6
+ class UnetWrapper(nn.Module):
7
+ def __init__(self, Unet: nn.Module, config_path: str,
8
+ cond_encoder = None):
9
+ super().__init__()
10
+ with open(config_path, "r") as file:
11
+ config = yaml.safe_load(file)['unet']
12
+ self.add_module('network', Unet(**config))
13
+
14
+ # ConditionalEncoder
15
+ self.add_module('cond_encoder', cond_encoder)
16
+
17
+ def forward(self, x, t, y=None, cond_drop_all:bool = False):
18
+ if t.dim() == 0:
19
+ t = x.new_full((x.size(0), ), t, dtype = torch.int, device = x.device)
20
+ if y is not None:
21
+ assert self.cond_encoder is not None, 'You need to set ConditionalEncoder for conditional sampling.'
22
+ if isinstance(y, str) or isinstance(y, transformers.tokenization_utils_base.BatchEncoding):
23
+ y = self.cond_encoder(y, cond_drop_all=cond_drop_all).to(x.device)
24
+ else:
25
+ if torch.is_tensor(y) == False:
26
+ y = torch.tensor([y], device=x.device)
27
+ y = self.cond_encoder(y, cond_drop_all=cond_drop_all).squeeze()
28
+ if y.size(0) != x.size(0):
29
+ y = y.repeat(x.size(0), 1)
30
+ return self.network(x, t, y)
31
+ else:
32
+ return self.network(x, t)
diffusion_model/sampler/base_sampler.py ADDED
@@ -0,0 +1,69 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from tqdm import tqdm
4
+ import yaml
5
+
6
+ from helper.util import extract
7
+ from helper.beta_generator import BetaGenerator
8
+ from abc import ABC, abstractmethod
9
+
10
+ class BaseSampler(nn.Module, ABC):
11
+ def __init__(self, config_path : str):
12
+ super().__init__()
13
+ with open(config_path, "r") as file:
14
+ self.config = yaml.safe_load(file)['sampler']
15
+ self.T = self.config['T']
16
+ beta_generator = BetaGenerator(T=self.T)
17
+ self.timesteps = None
18
+
19
+ self.register_buffer('beta', getattr(beta_generator,
20
+ f"{self.config['beta']}_beta_schedule",
21
+ beta_generator.linear_beta_schedule)())
22
+
23
+ self.register_buffer('alpha', 1 - self.beta)
24
+ self.register_buffer('alpha_sqrt', self.alpha.sqrt())
25
+ self.register_buffer('alpha_bar', torch.cumprod(self.alpha, dim = 0))
26
+
27
+ @abstractmethod
28
+ @torch.no_grad()
29
+ def get_x_prev(self, x, idx, eps_hat):  # matches the (x, idx, eps_hat) call in p_sample and the subclass implementations
30
+ pass
31
+
32
+ def set_network(self, network : nn.Module):
33
+ self.network = network
34
+
35
+ def q_sample(self, x0, t, eps = None):
36
+ alpha_t_bar = extract(self.alpha_bar, t, x0.shape)
37
+ if eps is None:
38
+ eps = torch.randn_like(x0)
39
+ q_xt_x0 = alpha_t_bar.sqrt() * x0 + (1 - alpha_t_bar).sqrt() * eps
40
+ return q_xt_x0
41
+
42
+ @torch.no_grad()
43
+ def reverse_process(self, x_T, only_last=True, **kwargs):
44
+ x = x_T
45
+ if only_last:
46
+ for i, t in tqdm(enumerate(reversed(self.timesteps))):
47
+ idx = len(self.timesteps) - i - 1
48
+ x = self.p_sample(x, t, idx, **kwargs)
49
+ return x
50
+ else:
51
+ x_seq = []
52
+ x_seq.append(x)
53
+ for i, t in tqdm(enumerate(reversed(self.timesteps))):
54
+ idx = len(self.timesteps) - i - 1
55
+ x_seq.append(self.p_sample(x_seq[-1], t, idx, **kwargs))
56
+ return x_seq
57
+
58
+ @torch.no_grad()
59
+ def p_sample(self, x, t, idx, gamma = None, **kwargs):
60
+ eps_hat = self.network(x = x, t = t, **kwargs)
61
+ if gamma is not None:
62
+ eps_null = self.network(x = x, t = t, cond_drop_all=True, **kwargs)
63
+ eps_hat = gamma * eps_hat + (1 - gamma) * eps_null
64
+ x = self.get_x_prev(x, idx, eps_hat)
65
+ return x
66
+
67
+ @torch.no_grad()
68
+ def forward(self, x_T, **kwargs):
69
+ return self.reverse_process(x_T, **kwargs)
diffusion_model/sampler/ddim.py ADDED
@@ -0,0 +1,29 @@
1
+ import torch
2
+
3
+ from diffusion_model.sampler.base_sampler import BaseSampler
4
+
5
+ class DDIM(BaseSampler):
6
+ def __init__(self, config_path):
7
+ super().__init__(config_path)
8
+ self.sampling_T = self.config['sampling_T']
9
+ step = self.T // self.sampling_T
10
+ self.timesteps = torch.arange(0, self.T, step, dtype=torch.int)
11
+
12
+ self.ddim_alpha = self.alpha_bar[self.timesteps]
13
+ self.sqrt_one_minus_alpha_bar = (1. - self.ddim_alpha).sqrt()
14
+ self.alpha_bar_prev = torch.cat([self.ddim_alpha[0:1], self.ddim_alpha[:-1]])
15
+ self.sigma = (self.config['eta'] *
16
+ torch.sqrt((1-self.alpha_bar_prev) / (1-self.ddim_alpha) *
17
+ (1 - self.ddim_alpha / self.alpha_bar_prev)))
18
+
19
+ def get_x_prev(self, x, tau, eps_hat) :
20
+ alpha_prev = self.alpha_bar_prev[tau]
21
+ sigma = self.sigma[tau]
22
+
23
+ x0_hat = (x - self.sqrt_one_minus_alpha_bar[tau] * eps_hat) \
24
+ / (self.ddim_alpha[tau] ** 0.5)
25
+ dir_xt = (1. - alpha_prev - sigma ** 2).sqrt() * eps_hat
26
+ if sigma == 0. : noise = 0.
27
+ else : noise = torch.randn_like(x, device = x.device)
28
+ x = alpha_prev.sqrt() * x0_hat + dir_xt + sigma * noise
29
+ return x
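
For reference, the update implemented by get_x_prev above is the standard DDIM step (a restatement of the code, with the noise scale controlled by eta; eta = 0 gives deterministic sampling):

\[
x_{\tau_{i-1}} = \sqrt{\bar{\alpha}_{\tau_{i-1}}}\,\hat{x}_0
+ \sqrt{1 - \bar{\alpha}_{\tau_{i-1}} - \sigma_{\tau_i}^2}\,\hat{\epsilon}_\theta
+ \sigma_{\tau_i} z,
\qquad
\hat{x}_0 = \frac{x_{\tau_i} - \sqrt{1-\bar{\alpha}_{\tau_i}}\,\hat{\epsilon}_\theta}{\sqrt{\bar{\alpha}_{\tau_i}}},
\quad z \sim \mathcal{N}(0, I).
\]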
diffusion_model/sampler/ddpm.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch
2
+
3
+ from diffusion_model.sampler.base_sampler import BaseSampler
4
+
5
+ class DDPM(BaseSampler):
6
+ def __init__(self, config_path):
7
+ super().__init__(config_path)
8
+ self.timesteps = torch.arange(0, self.T, dtype=torch.int)
9
+ self.sqrt_one_minus_alpha_bar = (1. - self.alpha_bar).sqrt()
10
+ self.alpha_bar_prev = torch.cat([self.alpha_bar[0:1], self.alpha_bar[:-1]])
11
+ self.sigma = (((1 - self.alpha_bar_prev) / (1 - self.alpha_bar)) * self.beta).sqrt()
12
+
13
+ @torch.no_grad()
14
+ def get_x_prev(self, x, t, eps_hat):
15
+ x = (1 / self.alpha_sqrt[t]) \
16
+ * (x - (self.beta[t] / self.sqrt_one_minus_alpha_bar[t] * eps_hat))
17
+ z = torch.randn_like(x) if t > 0 else 0.
18
+ x = x + self.sigma[t] * z
19
+ return x
20
+
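
Likewise, get_x_prev here is the DDPM ancestral step (restated from the code above):

\[
x_{t-1} = \frac{1}{\sqrt{\alpha_t}}\left(x_t - \frac{\beta_t}{\sqrt{1-\bar{\alpha}_t}}\,\hat{\epsilon}_\theta(x_t, t)\right) + \sigma_t z,
\qquad
\sigma_t^2 = \frac{1-\bar{\alpha}_{t-1}}{1-\bar{\alpha}_t}\,\beta_t,
\quad z \sim \mathcal{N}(0, I)\ \text{for } t > 0.
\]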
helper/beta_generator.py ADDED
@@ -0,0 +1,47 @@
1
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L442
2
+ import torch
3
+ import math
4
+
5
+ class BetaGenerator():
6
+ def __init__(self, T) :
7
+ self.T = T
8
+
9
+ def fixed_beta_schedule(self, beta) :
10
+ betas = torch.Tensor.repeat(torch.Tensor([beta]) , self.T)
11
+ return betas
12
+
13
+ def linear_beta_schedule(self):
14
+ """
15
+ linear schedule, proposed in original ddpm paper
16
+ """
17
+ scale = 1000 / self.T
18
+ beta_start = scale * 0.0001
19
+ beta_end = scale * 0.02
20
+ return torch.linspace(beta_start, beta_end, self.T)
21
+
22
+ def cosine_beta_schedule(self, s = 0.008):
23
+ """
24
+ cosine schedule
25
+ as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
26
+ """
27
+ steps = self.T + 1
28
+ t = torch.linspace(0, self.T, steps, dtype = torch.float32) / self.T
29
+ alphas_cumprod = torch.cos((t + s) / (1 + s) * math.pi * 0.5) ** 2
30
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
31
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
32
+ return torch.clip(betas, 0, 0.999)
33
+
34
+ def sigmoid_beta_schedule(self, start = -3, end = 3, tau = 1):
35
+ """
36
+ sigmoid schedule
37
+ proposed in https://arxiv.org/abs/2212.11972 - Figure 8
38
+ better for images > 64x64, when used during training
39
+ """
40
+ steps = self.T + 1
41
+ t = torch.linspace(0, self.T, steps, dtype = torch.float32) / self.T
42
+ v_start = torch.tensor(start / tau).sigmoid()
43
+ v_end = torch.tensor(end / tau).sigmoid()
44
+ alphas_cumprod = (-((t * (end - start) + start) / tau).sigmoid() + v_end) / (v_end - v_start)
45
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
46
+ betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
47
+ return torch.clip(betas, 0, 0.999)
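
A short inspection sketch (illustrative): the sampler: section of composite_config.yaml selects 'sigmoid', which BaseSampler resolves to sigmoid_beta_schedule via getattr.

import torch
from helper.beta_generator import BetaGenerator

gen = BetaGenerator(T=1000)
betas = gen.sigmoid_beta_schedule()               # schedule named in the config
alpha_bar = torch.cumprod(1 - betas, dim=0)
print(betas.shape, betas[0].item(), alpha_bar[-1].item())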
helper/cond_encoder.py ADDED
@@ -0,0 +1,69 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import yaml
4
+
5
+ class BaseCondEncoder(nn.Module):
6
+ def __init__(
7
+ self,
8
+ config_path
9
+ ):
10
+ super().__init__()
11
+ with open(config_path, "r") as file:
12
+ self.config = yaml.safe_load(file)['cond_encoder']
13
+ self.embed_dim = self.config['embed_dim']
14
+ self.cond_dim = self.config['cond_dim']
15
+ if 'cond_drop_prob' in self.config:
16
+ self.cond_drop_prob = self.config['cond_drop_prob']
17
+ self.null_embedding = nn.Parameter(torch.randn(self.embed_dim))
18
+ else:
19
+ self.cond_drop_prob = 0.0
20
+
21
+ self.cond_mlp = nn.Sequential(
22
+ nn.Linear(self.embed_dim, self.cond_dim),
23
+ nn.GELU(),
24
+ nn.Linear(self.cond_dim, self.cond_dim)
25
+ )
26
+
27
+ def cond_drop(self, y: torch.tensor):
28
+ if self.training and self.cond_drop_prob > 0.0:
29
+ flags = torch.zeros((y.size(0), ), device=y.device).float().uniform_(0, 1) < self.cond_drop_prob
30
+ y[flags] = self.null_embedding.to(y.dtype)
31
+ return y
32
+
33
+ class CLIPEncoder(BaseCondEncoder):
34
+ def __init__(
35
+ self,
36
+ clip,
37
+ config_path
38
+ ):
39
+ super().__init__(config_path)
40
+ self.clip = clip
41
+ self.clip.eval()
42
+ for param in self.clip.parameters():
43
+ param.requires_grad = False
44
+
45
+ def forward(self, y, cond_drop_all:bool = False):
46
+ if isinstance(y, str):
47
+ y = self.clip.text_encode(y, tokenize=True)
48
+ else:
49
+ y = self.clip.text_encode(y, tokenize=False)
50
+ y = self.cond_drop(y) # Only training
51
+ if cond_drop_all:
52
+ y[:] = self.null_embedding
53
+ return self.cond_mlp(y)
54
+
55
+ class ClassEncoder(BaseCondEncoder):
56
+ def __init__(
57
+ self,
58
+ config_path
59
+ ):
60
+ super().__init__(config_path)
61
+ self.num_cond = self.config['num_cond']
62
+ self.embed = nn.Embedding(self.num_cond, self.embed_dim)
63
+
64
+ def forward(self, y, cond_drop_all:bool = False):
65
+ y = self.embed(y)
66
+ y = self.cond_drop(y) # Only training
67
+ if cond_drop_all:
68
+ y[:] = self.null_embedding
69
+ return self.cond_mlp(y)
helper/data_generator.py ADDED
@@ -0,0 +1,129 @@
1
+ from torchvision.datasets import CIFAR10, CelebA
2
+ from torch.utils.data import DataLoader, Dataset
3
+ from torchvision.transforms import Compose, ToTensor, Lambda, CenterCrop, Resize, RandomHorizontalFlip
4
+ import os
5
+ import torch
6
+ import json
7
+ from PIL import Image as im
8
+ from helper.tokenizer import Tokenizer
9
+ from transformers import AutoProcessor
10
+
11
+ def center_crop_and_resize(img, crop_size, resize_size):
12
+ width, height = img.size
13
+
14
+ # 1. Center Crop
15
+ left = (width - crop_size) / 2
16
+ top = (height - crop_size) / 2
17
+ right = (width + crop_size) / 2
18
+ bottom = (height + crop_size) / 2
19
+
20
+ img_cropped = img.crop((left, top, right, bottom))
21
+
22
+ # 2. Resize
23
+ img_resized = img_cropped.resize((resize_size, resize_size), im.Resampling.BICUBIC)
24
+
25
+ return img_resized
26
+
27
+ class UnlabelDataset(Dataset):
28
+ def __init__(self, path, transform):
29
+ self.path = path
30
+ self.file_list = os.listdir(path)
31
+ self.transform = transform
32
+
33
+ def __len__(self) :
34
+ return len(self.file_list)
35
+
36
+ def __getitem__(self, index):
37
+ img_path = self.path + self.file_list[index]
38
+ image = im.open(img_path)
39
+ image = self.transform(image)
40
+ return image
41
+
42
+ class CompositeDataset(Dataset):
43
+ def __init__(self, path, text_path, processor: AutoProcessor = None):
44
+ self.path = path
45
+ self.text_path = text_path
46
+ self.tokenizer = Tokenizer()
47
+ self.processor = processor
48
+
49
+ self.file_numbers = os.listdir(path)
50
+ self.file_numbers = [ os.path.splitext(filename)[0] for filename in self.file_numbers ]
51
+
52
+ self.transform = Compose([
53
+ ToTensor(),
54
+ CenterCrop(400),
55
+ Resize(256, antialias=True),
56
+ RandomHorizontalFlip(),
57
+ Lambda(lambda x: (x - 0.5) * 2)
58
+ ])
59
+
60
+ def __len__(self) :
61
+ return len(self.file_numbers)
62
+
63
+ def get_text(self, text_path):
64
+ with open(text_path, encoding = 'CP949') as f:
65
+ text = json.load(f)['description']['impression']['description']
66
+ return text
67
+
68
+ def __getitem__(self, idx) :
69
+ img_path = self.path + self.file_numbers[idx] + '.png'
70
+ text_path = self.text_path + self.file_numbers[idx] + '.json'
71
+ image = im.open(img_path)
72
+ text = self.get_text(text_path)
73
+ if self.processor is not None:
74
+ image = center_crop_and_resize(image, 400, 256)
75
+ inputs = self.processor(
76
+ text=text,
77
+ images=image,
78
+ return_tensors="pt",
79
+ padding='max_length',
80
+ max_length=77,
81
+ truncation=True,
82
+ )
83
+ for j in inputs:
84
+ inputs[j] = inputs[j].squeeze(0)
85
+ return inputs
86
+ else:
87
+ image = self.transform(image)
88
+ text = self.tokenizer.tokenize(text)
89
+ for j in text:
90
+ text[j] = text[j].squeeze(0)
91
+ return image, text
92
+
93
+ class DataGenerator():
94
+ def __init__(self, num_workers: int = 4, pin_memory: bool = True):
95
+ self.transform = Compose([
96
+ ToTensor(),
97
+ Lambda(lambda x: (x - 0.5) * 2)
98
+ ])
99
+ self.num_workers = num_workers
100
+ self.pin_memory = pin_memory
101
+
102
+ def cifar10(self, path = './datasets', batch_size : int = 64, train : bool = True):
103
+ train_data = CIFAR10(path, download = True, train = train, transform = self.transform)
104
+ dl = DataLoader(train_data, batch_size, shuffle = True, num_workers=self.num_workers, pin_memory=self.pin_memory)
105
+ return dl
106
+
107
+ def celeba(self, path = './datasets', batch_size : int = 16):
108
+ train_data = CelebA(path, transform = Compose([
109
+ ToTensor(),
110
+ CenterCrop(178),
111
+ Resize(128),
112
+ Lambda(lambda x: (x - 0.5) * 2)
113
+ ]))
114
+ dl = DataLoader(train_data, batch_size, shuffle = True, num_workers=self.num_workers, pin_memory=self.pin_memory)
115
+ return dl
116
+
117
+ def composite(self, path, text_path, batch_size : int = 16, is_process: bool = False):
118
+ processor = None
119
+ if is_process:
120
+ model_name = "Bingsu/clip-vit-base-patch32-ko"
121
+ processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
122
+ dataset = CompositeDataset(path, text_path, processor)
123
+ return DataLoader(dataset, batch_size=batch_size, shuffle=True,
124
+ num_workers=self.num_workers, pin_memory=self.pin_memory)
125
+
126
+ def random_data(self, size, batch_size : int = 4):
127
+ train_data = torch.randn(size)
128
+ return DataLoader(train_data, batch_size)
129
+
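A usage sketch for DataGenerator (not part of the diff); dataset locations and batch sizes are placeholders. The CIFAR-10 and CelebA loaders emit images rescaled to [-1, 1] by the Lambda((x - 0.5) * 2) transform.

import torch
from helper.data_generator import DataGenerator

dg = DataGenerator(num_workers=4, pin_memory=True)

train_dl = dg.cifar10(path="./datasets", batch_size=64, train=True)
images, labels = next(iter(train_dl))
print(images.shape, images.min().item(), images.max().item())  # (64, 3, 32, 32), ~-1.0, ~1.0

# Synthetic batches for quick smoke tests of a model's forward pass.
rand_dl = dg.random_data(size=(16, 3, 32, 32), batch_size=4)
for x in rand_dl:
    print(x.shape)  # (4, 3, 32, 32)
    break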
helper/ema.py ADDED
@@ -0,0 +1,375 @@
1
+ # https://github.com/lucidrains/ema-pytorch/tree/main
2
+
3
+ from __future__ import annotations
4
+ from typing import Callable
5
+
6
+ from copy import deepcopy
7
+ from functools import partial
8
+
9
+ import torch
10
+ from torch import nn, Tensor
11
+ from torch.nn import Module
12
+
13
+ def exists(val):
14
+ return val is not None
15
+
16
+ def divisible_by(num, den):
17
+ return (num % den) == 0
18
+
19
+ def get_module_device(m: Module):
20
+ return next(m.parameters()).device
21
+
22
+ def maybe_coerce_dtype(t, dtype):
23
+ if t.dtype == dtype:
24
+ return t
25
+
26
+ return t.to(dtype)
27
+
28
+ def inplace_copy(tgt: Tensor, src: Tensor, *, auto_move_device = False, coerce_dtype = False):
29
+ if auto_move_device:
30
+ src = src.to(tgt.device)
31
+
32
+ if coerce_dtype:
33
+ src = maybe_coerce_dtype(src, tgt.dtype)
34
+
35
+ tgt.copy_(src)
36
+
37
+ def inplace_lerp(tgt: Tensor, src: Tensor, weight, *, auto_move_device = False, coerce_dtype = False):
38
+ if auto_move_device:
39
+ src = src.to(tgt.device)
40
+
41
+ if coerce_dtype:
42
+ src = maybe_coerce_dtype(src, tgt.dtype)
43
+
44
+ tgt.lerp_(src, weight)
45
+
46
+ class EMA(Module):
47
+ """
48
+ Implements exponential moving average shadowing for your model.
49
+
50
+ Utilizes an inverse decay schedule to manage longer term training runs.
51
+ By adjusting the power, you can control how fast EMA will ramp up to your specified beta.
52
+
53
+ @crowsonkb's notes on EMA Warmup:
54
+
55
+ If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are
56
+ good values for models you plan to train for a million or more steps (reaches decay
57
+ factor 0.999 at 31.6K steps, 0.9999 at 1M steps), gamma=1, power=3/4 for models
58
+ you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at
59
+ 215.4k steps).
60
+
61
+ Args:
62
+ inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
63
+ power (float): Exponential factor of EMA warmup. Default: 2/3.
64
+ min_value (float): The minimum EMA decay rate. Default: 0.
65
+ """
66
+
67
+ def __init__(
68
+ self,
69
+ model: Module,
70
+ ema_model: Module | Callable[[], Module] | None = None, # if your model has lazylinears or other types of non-deepcopyable modules, you can pass in your own ema model
71
+ beta = 0.9999,
72
+ update_after_step = 100,
73
+ update_every = 10,
74
+ inv_gamma = 1.0,
75
+ power = 2 / 3,
76
+ min_value = 0.0,
77
+ param_or_buffer_names_no_ema: set[str] = set(),
78
+ ignore_names: set[str] = set(),
79
+ ignore_startswith_names: set[str] = set(),
80
+ include_online_model = True, # set this to False if you do not wish for the online model to be saved along with the ema model (managed externally)
81
+ allow_different_devices = False, # if the EMA model is on a different device (say CPU), automatically move the tensor
82
+ use_foreach = False,
83
+ update_model_with_ema_every = None, # update the model with EMA model weights every number of steps, for better continual learning https://arxiv.org/abs/2406.02596
84
+ update_model_with_ema_beta = 0., # amount of model weight to keep when updating to EMA (hare to tortoise)
85
+ forward_method_names: tuple[str, ...] = (),
86
+ move_ema_to_online_device = False,
87
+ coerce_dtype = False,
88
+ lazy_init_ema = False,
89
+ ):
90
+ super().__init__()
91
+ self.beta = beta
92
+
93
+ self.is_frozen = beta == 1.
94
+
95
+ # whether to include the online model within the module tree, so that state_dict also saves it
96
+
97
+ self.include_online_model = include_online_model
98
+
99
+ if include_online_model:
100
+ self.online_model = model
101
+ else:
102
+ self.online_model = [model] # hack
103
+
104
+ # handle callable returning ema module
105
+
106
+ if not isinstance(ema_model, Module) and callable(ema_model):
107
+ ema_model = ema_model()
108
+
109
+ # ema model
110
+
111
+ self.ema_model = None
112
+ self.forward_method_names = forward_method_names
113
+
114
+ if not lazy_init_ema:
115
+ self.init_ema(ema_model)
116
+ else:
117
+ assert not exists(ema_model)
118
+
119
+ # tensor update functions
120
+
121
+ self.inplace_copy = partial(inplace_copy, auto_move_device = allow_different_devices, coerce_dtype = coerce_dtype)
122
+ self.inplace_lerp = partial(inplace_lerp, auto_move_device = allow_different_devices, coerce_dtype = coerce_dtype)
123
+
124
+ # updating hyperparameters
125
+
126
+ self.update_every = update_every
127
+ self.update_after_step = update_after_step
128
+
129
+ self.inv_gamma = inv_gamma
130
+ self.power = power
131
+ self.min_value = min_value
132
+
133
+ assert isinstance(param_or_buffer_names_no_ema, (set, list))
134
+ self.param_or_buffer_names_no_ema = param_or_buffer_names_no_ema # parameter or buffer
135
+
136
+ self.ignore_names = ignore_names
137
+ self.ignore_startswith_names = ignore_startswith_names
138
+
139
+ # continual learning related
140
+
141
+ self.update_model_with_ema_every = update_model_with_ema_every
142
+ self.update_model_with_ema_beta = update_model_with_ema_beta
143
+
144
+ # whether to manage if EMA model is kept on a different device
145
+
146
+ self.allow_different_devices = allow_different_devices
147
+
148
+ # whether to coerce dtype when copy or lerp from online to EMA model
149
+
150
+ self.coerce_dtype = coerce_dtype
151
+
152
+ # whether to move EMA model to online model device automatically
153
+
154
+ self.move_ema_to_online_device = move_ema_to_online_device
155
+
156
+ # whether to use foreach
157
+
158
+ if use_foreach:
159
+ assert hasattr(torch, '_foreach_lerp_') and hasattr(torch, '_foreach_copy_'), 'your version of torch does not have the prerequisite foreach functions'
160
+
161
+ self.use_foreach = use_foreach
162
+
163
+ # init and step states
164
+
165
+ self.register_buffer('initted', torch.tensor(False))
166
+ self.register_buffer('step', torch.tensor(0))
167
+
168
+ def init_ema(
169
+ self,
170
+ ema_model: Module | None = None
171
+ ):
172
+ self.ema_model = ema_model
173
+
174
+ if not exists(self.ema_model):
175
+ try:
176
+ self.ema_model = deepcopy(self.model)
177
+ except Exception as e:
178
+ print(f'Error: While trying to deepcopy model: {e}')
179
+ print('Your model was not copyable. Please make sure you are not using any LazyLinear')
180
+ exit()
181
+
182
+ for p in self.ema_model.parameters():
183
+ p.detach_()
184
+
185
+ # forwarding methods
186
+
187
+ for forward_method_name in self.forward_method_names:
188
+ fn = getattr(self.ema_model, forward_method_name)
189
+ setattr(self, forward_method_name, fn)
190
+
191
+ # parameter and buffer names
192
+
193
+ self.parameter_names = {name for name, param in self.ema_model.named_parameters() if torch.is_floating_point(param) or torch.is_complex(param)}
194
+ self.buffer_names = {name for name, buffer in self.ema_model.named_buffers() if torch.is_floating_point(buffer) or torch.is_complex(buffer)}
195
+
196
+ def add_to_optimizer_post_step_hook(self, optimizer):
197
+ assert hasattr(optimizer, 'register_step_post_hook')
198
+
199
+ def hook(*_):
200
+ self.update()
201
+
202
+ return optimizer.register_step_post_hook(hook)
203
+
204
+ @property
205
+ def model(self):
206
+ return self.online_model if self.include_online_model else self.online_model[0]
207
+
208
+ def eval(self):
209
+ return self.ema_model.eval()
210
+
211
+ @torch.no_grad()
212
+ def forward_eval(self, *args, **kwargs):
213
+ # handy function for invoking ema model with no grad + eval
214
+ training = self.ema_model.training
215
+ out = self.ema_model(*args, **kwargs)
216
+ self.ema_model.train(training)
217
+ return out
218
+
219
+ def restore_ema_model_device(self):
220
+ device = self.initted.device
221
+ self.ema_model.to(device)
222
+
223
+ def get_params_iter(self, model):
224
+ for name, param in model.named_parameters():
225
+ if name not in self.parameter_names:
226
+ continue
227
+ yield name, param
228
+
229
+ def get_buffers_iter(self, model):
230
+ for name, buffer in model.named_buffers():
231
+ if name not in self.buffer_names:
232
+ continue
233
+ yield name, buffer
234
+
235
+ def copy_params_from_model_to_ema(self):
236
+ copy = self.inplace_copy
237
+
238
+ for (_, ma_params), (_, current_params) in zip(self.get_params_iter(self.ema_model), self.get_params_iter(self.model)):
239
+ copy(ma_params.data, current_params.data)
240
+
241
+ for (_, ma_buffers), (_, current_buffers) in zip(self.get_buffers_iter(self.ema_model), self.get_buffers_iter(self.model)):
242
+ copy(ma_buffers.data, current_buffers.data)
243
+
244
+ def copy_params_from_ema_to_model(self):
245
+ copy = self.inplace_copy
246
+
247
+ for (_, ma_params), (_, current_params) in zip(self.get_params_iter(self.ema_model), self.get_params_iter(self.model)):
248
+ copy(current_params.data, ma_params.data)
249
+
250
+ for (_, ma_buffers), (_, current_buffers) in zip(self.get_buffers_iter(self.ema_model), self.get_buffers_iter(self.model)):
251
+ copy(current_buffers.data, ma_buffers.data)
252
+
253
+ def update_model_with_ema(self, decay = None):
254
+ if not exists(decay):
255
+ decay = self.update_model_with_ema_beta
256
+
257
+ if decay == 0.:
258
+ return self.copy_params_from_ema_to_model()
259
+
260
+ self.update_moving_average(self.model, self.ema_model, decay)
261
+
262
+ def get_current_decay(self):
263
+ epoch = (self.step - self.update_after_step - 1).clamp(min = 0.)
264
+ value = 1 - (1 + epoch / self.inv_gamma) ** - self.power
265
+
266
+ if epoch.item() <= 0:
267
+ return 0.
268
+
269
+ return value.clamp(min = self.min_value, max = self.beta).item()
270
+
271
+ def update(self):
272
+ step = self.step.item()
273
+ self.step += 1
274
+
275
+ if not self.initted.item():
276
+ if not exists(self.ema_model):
277
+ self.init_ema()
278
+
279
+ self.copy_params_from_model_to_ema()
280
+ self.initted.data.copy_(torch.tensor(True))
281
+ return
282
+
283
+ should_update = divisible_by(step, self.update_every)
284
+
285
+ if should_update and step <= self.update_after_step:
286
+ self.copy_params_from_model_to_ema()
287
+ return
288
+
289
+ if should_update:
290
+ self.update_moving_average(self.ema_model, self.model)
291
+
292
+ if exists(self.update_model_with_ema_every) and divisible_by(step, self.update_model_with_ema_every):
293
+ self.update_model_with_ema()
294
+
295
+ @torch.no_grad()
296
+ def update_moving_average(self, ma_model, current_model, current_decay = None):
297
+ if self.is_frozen:
298
+ return
299
+
300
+ # move ema model to online model device if not same and needed
301
+
302
+ if self.move_ema_to_online_device and get_module_device(ma_model) != get_module_device(current_model):
303
+ ma_model.to(get_module_device(current_model))
304
+
305
+ # get current decay
306
+
307
+ if not exists(current_decay):
308
+ current_decay = self.get_current_decay()
309
+
310
+ # store all source and target tensors to copy or lerp
311
+
312
+ tensors_to_copy = []
313
+ tensors_to_lerp = []
314
+
315
+ # loop through parameters
316
+
317
+ for (name, current_params), (_, ma_params) in zip(self.get_params_iter(current_model), self.get_params_iter(ma_model)):
318
+ if name in self.ignore_names:
319
+ continue
320
+
321
+ if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
322
+ continue
323
+
324
+ if name in self.param_or_buffer_names_no_ema:
325
+ tensors_to_copy.append((ma_params.data, current_params.data))
326
+ continue
327
+
328
+ tensors_to_lerp.append((ma_params.data, current_params.data))
329
+
330
+ # loop through buffers
331
+
332
+ for (name, current_buffer), (_, ma_buffer) in zip(self.get_buffers_iter(current_model), self.get_buffers_iter(ma_model)):
333
+ if name in self.ignore_names:
334
+ continue
335
+
336
+ if any([name.startswith(prefix) for prefix in self.ignore_startswith_names]):
337
+ continue
338
+
339
+ if name in self.param_or_buffer_names_no_ema:
340
+ tensors_to_copy.append((ma_buffer.data, current_buffer.data))
341
+ continue
342
+
343
+ tensors_to_lerp.append((ma_buffer.data, current_buffer.data))
344
+
345
+ # execute inplace copy or lerp
346
+
347
+ if not self.use_foreach:
348
+
349
+ for tgt, src in tensors_to_copy:
350
+ self.inplace_copy(tgt, src)
351
+
352
+ for tgt, src in tensors_to_lerp:
353
+ self.inplace_lerp(tgt, src, 1. - current_decay)
354
+
355
+ else:
356
+ # use foreach if available and specified
357
+
358
+ if self.allow_different_devices:
359
+ tensors_to_copy = [(tgt, src.to(tgt.device)) for tgt, src in tensors_to_copy]
360
+ tensors_to_lerp = [(tgt, src.to(tgt.device)) for tgt, src in tensors_to_lerp]
361
+
362
+ if self.coerce_dtype:
363
+ tensors_to_copy = [(tgt, maybe_coerce_dtype(src, tgt.dtype)) for tgt, src in tensors_to_copy]
364
+ tensors_to_lerp = [(tgt, maybe_coerce_dtype(src, tgt.dtype)) for tgt, src in tensors_to_lerp]
365
+
366
+ if len(tensors_to_copy) > 0:
367
+ tgt_copy, src_copy = zip(*tensors_to_copy)
368
+ torch._foreach_copy_(tgt_copy, src_copy)
369
+
370
+ if len(tensors_to_lerp) > 0:
371
+ tgt_lerp, src_lerp = zip(*tensors_to_lerp)
372
+ torch._foreach_lerp_(tgt_lerp, src_lerp, 1. - current_decay)
373
+
374
+ def __call__(self, *args, **kwargs):
375
+ return self.ema_model(*args, **kwargs)
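A usage sketch for the EMA wrapper with its defaults (beta=0.9999, update_after_step=100, update_every=10); the tiny model and loss below are placeholders, not part of the diff.

import torch
from torch import nn
from helper.ema import EMA

model = nn.Linear(8, 8)                      # placeholder online model
ema = EMA(model, beta=0.9999, update_after_step=100, update_every=10)
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)

for _ in range(1000):
    loss = model(torch.randn(4, 8)).pow(2).mean()
    loss.backward()
    opt.step()
    opt.zero_grad()
    ema.update()   # copies weights until update_after_step, then lerps with the warmed-up decay

# With inv_gamma=1 and power=2/3 the decay 1 - (1 + n)**(-2/3) reaches ~0.999 around
# n = 31.6K steps, since 31600**(2/3) is roughly 1000, matching the docstring's note.
samples = ema(torch.randn(4, 8))   # __call__ runs the EMA copy
ema_module = ema.ema_model         # or use the shadowed module directly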
helper/loader.py ADDED
@@ -0,0 +1,48 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from helper.ema import EMA
4
+ from transformers import get_cosine_schedule_with_warmup
5
+
6
+ class Loader():
7
+ def __init__(self, device = None):
8
+ self.device = device
9
+
10
+ def print_model(self, check_point):
11
+ print("Epoch: " + str(check_point["epoch"]))
12
+ print("Training step: " + str(check_point["training_steps"]))
13
+ print("Best loss: " + str(check_point["best_loss"]))
14
+ print("Batch size: " + str(check_point["batch_size"]))
15
+ print("Number of batches: " + str(check_point["number_of_batches"]))
16
+
17
+ def model_load(self, file_name : str, model : nn.Module,
18
+ print_dict : bool = True, is_ema: bool = True):
19
+ check_point = torch.load(file_name + ".pth", map_location=self.device,
20
+ weights_only=True)
21
+ if print_dict: self.print_model(check_point)
22
+ if is_ema:
23
+ model = EMA(model)
24
+ model.load_state_dict(check_point['ema_state_dict'])
25
+ model = model.ema_model
26
+ else:
27
+ model.load_state_dict(check_point['model_state_dict'])
28
+ model.eval()
29
+ print("===Model loaded!===")
30
+ return model
31
+
32
+ def load_for_training(self, file_name: str, model: nn.Module, print_dict: bool = True):
33
+ check_point = torch.load(file_name + ".pth", map_location=self.device,
34
+ weights_only=True)
35
+ if print_dict: self.print_model(check_point)
36
+ model.load_state_dict(check_point['model_state_dict'])
37
+ model.train()
38
+ ema = EMA(model)
39
+ ema.load_state_dict(check_point['ema_state_dict'])
40
+ ema.train()
41
+ optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4)
42
+ optimizer.load_state_dict(check_point["optimizer_state_dict"])
43
+ scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
44
+ scheduler.load_state_dict(check_point["scheduler_state_dict"])
45
+ epoch = check_point["epoch"]
46
+ loss = check_point["best_loss"]
47
+ print("===Model/EMA/Optimizer/Scheduler/Epoch/Loss loaded!===")
48
+ return model, ema, optimizer, scheduler, epoch, loss
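A usage sketch for Loader (not part of the diff); the checkpoint path and the network being restored are placeholders. Both methods append ".pth" to the name they are given.

import torch
from helper.loader import Loader

loader = Loader(device="cuda" if torch.cuda.is_available() else "cpu")
unet = ...  # placeholder: the nn.Module whose weights the checkpoint holds

# Inference: is_ema=True unwraps the EMA weights and puts the module in eval mode.
model = loader.model_load("checkpoints/ddpm_cifar10", model=unet, is_ema=True)

# Resuming training: also restores the EMA wrapper, optimizer, scheduler, epoch and best loss.
model, ema, optimizer, scheduler, epoch, best_loss = loader.load_for_training(
    "checkpoints/ddpm_cifar10", model=unet)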
helper/painter.py ADDED
@@ -0,0 +1,51 @@
1
+ import io
2
+ from PIL import Image as im
3
+ from tqdm import tqdm
4
+ import matplotlib.pyplot as plt
5
+ import torch
6
+ import numpy as np
7
+
8
+ class Painter(object):
9
+ def __init__(self) :
10
+ pass
11
+
12
+ def show_images(self, images, title : str = '', index : bool = False, cmap = None, show = True):
13
+ if isinstance(images, torch.Tensor):
14
+ images = images.permute(0, 2, 3, 1)
15
+ images = images.detach().cpu().numpy()
16
+ images = np.clip(images / 2 + 0.5, 0, 1)
17
+
18
+ fig = plt.figure(figsize=(8, 8))
19
+ rows = int(len(images) ** (1 / 2))
20
+ cols = round(len(images) / rows)
21
+
22
+ idx = 0
23
+ for _ in range(rows):
24
+ for _ in range(cols):
25
+ fig.add_subplot(rows, cols, idx + 1)
26
+
27
+ if idx < len(images):
28
+ plt.imshow(images[idx], cmap = cmap)
29
+ if index :
30
+ plt.title(idx + 1)
31
+ plt.axis('off')
32
+ idx += 1
33
+ fig.suptitle(title, fontsize=30)
34
+ if show:
35
+ plt.show()
36
+
37
+ def show_first_batch(self, loader):
38
+ for batch in loader:
39
+ self.show_images(images = batch, title = "First Batch")
40
+ break
41
+
42
+ def make_gif(self, images, file_name):
43
+ imgs = []
44
+ for i in tqdm(range(len(images))):
45
+ img_buf = io.BytesIO()
46
+ self.show_images(images[i], title = 't = ' + str(i), show=False)
47
+ plt.savefig(img_buf, format='png')
48
+ imgs.append(im.open(img_buf))
49
+ imgs[0].save(file_name + '.gif', format='GIF', append_images=imgs, save_all=True, duration=1, loop=0)
50
+ plt.close('all')
51
+
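A usage sketch for Painter (not part of the diff); it expects (N, C, H, W) tensors in [-1, 1], matching the loaders above, and make_gif writes file_name + '.gif'. The random batch stands in for model samples.

import torch
from helper.painter import Painter

painter = Painter()
batch = torch.rand(16, 3, 32, 32) * 2 - 1      # stand-in for samples in [-1, 1]
painter.show_images(batch, title="Samples", index=True)

# A fake "denoising trajectory": a list of batches, e.g. intermediate x_t's from sampling.
trajectory = [batch + 0.1 * t * torch.randn_like(batch) for t in range(10)]
painter.make_gif(trajectory, "denoising_demo")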
helper/tokenizer.py ADDED
@@ -0,0 +1,9 @@
1
+ from transformers import AutoTokenizer
2
+
3
+ class Tokenizer:
4
+ def __init__(self, model_name="Bingsu/clip-vit-base-patch32-ko"):
5
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
6
+ self.vocab_size = self.tokenizer.vocab_size
7
+
8
+ def tokenize(self, text):
9
+ return self.tokenizer(text, padding='max_length', max_length=77, truncation=True, return_tensors='pt')
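A usage sketch for Tokenizer (not part of the diff); the example captions are made up, and the padding/truncation to 77 tokens matches CLIP's text context length.

from helper.tokenizer import Tokenizer

tokenizer = Tokenizer()  # defaults to "Bingsu/clip-vit-base-patch32-ko"
batch = tokenizer.tokenize(["a chest X-ray with no acute findings", "심장 비대가 관찰됨"])
print(batch["input_ids"].shape, batch["attention_mask"].shape)  # both (2, 77)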
helper/trainer.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import DataLoader
4
+ from accelerate import Accelerator
5
+ from tqdm import tqdm
6
+ from typing import Callable
7
+ from helper.ema import EMA
8
+
9
+ class Trainer():
10
+ def __init__(self,
11
+ model: nn.Module,
12
+ loss_fn: Callable,
13
+ ema: EMA = None,
14
+ optimizer: torch.optim.Optimizer = None,
15
+ scheduler: torch.optim.lr_scheduler = None,
16
+ start_epoch = 0,
17
+ best_loss = float("inf"),
18
+ accumulation_steps: int = 1,
19
+ max_grad_norm: float = 1.0):
20
+ self.accelerator = Accelerator(mixed_precision = 'fp16', gradient_accumulation_steps=accumulation_steps)
21
+ self.model = model.to(self.accelerator.device)
22
+ if ema is None:
23
+ self.ema = EMA(self.model).to(self.accelerator.device)
24
+ else:
25
+ self.ema = ema.to(self.accelerator.device)
26
+ self.loss_fn = loss_fn
27
+ self.optimizer = optimizer
28
+ if self.optimizer is None:
29
+ self.optimizer = torch.optim.AdamW(self.model.parameters(), lr = 1e-4)
30
+ self.scheduler = scheduler
31
+ if self.scheduler is None:
32
+ self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=100)
33
+ self.start_epoch = start_epoch
34
+ self.best_loss = best_loss
35
+ self.accumulation_steps = accumulation_steps
36
+ self.max_grad_norm = max_grad_norm
37
+
38
+ def train(self, dl : DataLoader, epochs: int, file_name : str, no_label : bool = False):
39
+ self.model.train()
40
+ self.model, self.optimizer, data_loader, self.scheduler = self.accelerator.prepare(
41
+ self.model, self.optimizer, dl, self.scheduler
42
+ )
43
+
44
+ for epoch in range(self.start_epoch + 1, epochs + 1):
45
+ epoch_loss = 0.0
46
+ progress_bar = tqdm(data_loader, leave=False, desc=f"Epoch {epoch}/{epochs}", colour="#005500", disable = not self.accelerator.is_local_main_process)
47
+ for step, batch in enumerate(progress_bar):
48
+ with self.accelerator.accumulate(self.model): # Context manager for accumulation
49
+ if no_label:
50
+ if isinstance(batch, list):
51
+ x = batch[0].to(self.accelerator.device)
52
+ else:
53
+ x = batch.to(self.accelerator.device)
54
+ else:
55
+ x, y = batch[0].to(self.accelerator.device), batch[1].to(self.accelerator.device)
56
+
57
+ with self.accelerator.autocast():
58
+ if no_label:
59
+ loss = self.loss_fn(x)
60
+ else:
61
+ loss = self.loss_fn(x, y=y)
62
+
63
+ # Backward pass; Accelerate scales the loss for mixed precision and gradient accumulation
64
+ self.accelerator.backward(loss)
65
+
66
+ # Gradient Clipping:
67
+ if self.max_grad_norm is not None and self.accelerator.sync_gradients:
68
+ self.accelerator.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
69
+
70
+ # Optimizer step (under accelerator.accumulate, the step is skipped until gradients sync)
71
+ self.optimizer.step()
72
+ self.ema.update()
73
+ self.optimizer.zero_grad()
74
+
75
+ epoch_loss += loss.item()
76
+ progress_bar.set_postfix(loss=epoch_loss / (step + 1)) # running average loss over the epoch so far
77
+
78
+ self.accelerator.wait_for_everyone()
79
+ if self.accelerator.is_main_process:
80
+ epoch_loss = epoch_loss / len(progress_bar)
81
+ self.scheduler.step()
82
+ log_string = f"Loss at epoch {epoch}: {epoch_loss :.4f}"
83
+
84
+ # Save the best model
85
+ if self.best_loss > epoch_loss:
86
+ self.best_loss = epoch_loss
87
+ torch.save({
88
+ "model_state_dict": self.accelerator.get_state_dict(self.model),
89
+ "ema_state_dict": self.ema.state_dict(),
90
+ "optimizer_state_dict": self.optimizer.state_dict(),
91
+ "scheduler_state_dict": self.scheduler.state_dict(),
92
+ "epoch": epoch,
93
+ "training_steps": epoch * len(dl),
94
+ "best_loss": self.best_loss,
95
+ "batch_size": dl.batch_size,
96
+ "number_of_batches": len(dl)
97
+ }, file_name + '.pth')
98
+ log_string += " --> Best model ever (stored)"
99
+ print(log_string)
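A usage sketch that ties the helpers together (not part of the diff); the diffusion model and its loss_fn are placeholders for whatever module this repository trains, and loss_fn must return a scalar loss for a batch, as Trainer assumes above.

import torch.nn as nn
from helper.data_generator import DataGenerator
from helper.trainer import Trainer

model: nn.Module = ...   # placeholder: e.g. a DDPM/LDM wrapper with trainable parameters
loss_fn = model          # placeholder: callable returning a scalar loss, loss_fn(x) or loss_fn(x, y=y)

dl = DataGenerator().cifar10(batch_size=64)
trainer = Trainer(model, loss_fn, accumulation_steps=2, max_grad_norm=1.0)

# CIFAR-10 batches are (image, label) pairs; with no_label=True only the images reach loss_fn.
trainer.train(dl, epochs=100, file_name="checkpoints/ddpm_cifar10", no_label=True)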
helper/util.py ADDED
@@ -0,0 +1,4 @@
1
+ def extract(a, t, x_shape):
2
+ b, *_ = t.shape
3
+ out = a.gather(-1, t)
4
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
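extract is the usual DDPM helper for indexing a 1-D schedule by per-sample timesteps and broadcasting the result over image dimensions; a small sketch follows (not part of the diff; the linear betas are illustrative only).

import torch
from helper.util import extract

betas = torch.linspace(1e-4, 2e-2, 1000)   # a (T,) schedule, illustrative values
t = torch.randint(0, 1000, (8,))           # one timestep per batch element
x = torch.randn(8, 3, 32, 32)

coeff = extract(betas, t, x.shape)         # (8, 1, 1, 1): betas[t_i] reshaped to broadcast
scaled = coeff * x                         # multiplies each sample by its own coefficient
print(coeff.shape)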