Spaces:

mehdidc
/

text_to_image_ddgan

Runtime error

App Files Files Community

Mehdi Cherti committed on Apr 2, 2023

Commit

e96a195

1 Parent(s): 2ab447a

update

Browse files

Files changed (12) hide show

EMA.py +8 -0
eval_all.sh +12 -2
run.py +28 -1
score_sde/models/discriminator.py +2 -2
score_sde/models/layers.py +1 -0
score_sde/models/projected_discriminator.py +783 -0
scripts/init.sh +14 -2
scripts/run_jurecadc_conda.sh +23 -0
scripts/run_juwelsbooster_conda.sh +19 -0
test.py +8 -0
test_ddgan.py +7 -2
train_ddgan.py +20 -4

EMA.py CHANGED Viewed

@@ -21,8 +21,16 @@ class EMA(Optimizer):
         self.optimizer = opt
         self.state = opt.state
         self.param_groups = opt.param_groups
     def step(self, *args, **kwargs):
         retval = self.optimizer.step(*args, **kwargs)
         # stop here if we are not applying EMA

         self.optimizer = opt
         self.state = opt.state
         self.param_groups = opt.param_groups
+        self.defaults = {}
     def step(self, *args, **kwargs):
+        # for group in self.optimizer.param_groups:
+            # group.setdefault('amsgrad', False)
+            # group.setdefault('maximize', False)
+            # group.setdefault('foreach', None)
+            # group.setdefault('capturable', False)
+            # group.setdefault('differentiable', False)
+            # group.setdefault('fused', False)
         retval = self.optimizer.step(*args, **kwargs)
         # stop here if we are not applying EMA

eval_all.sh CHANGED Viewed

@@ -1,7 +1,17 @@
 #!/bin/bash
-for model in ddgan_sd_v10 ddgan_laion2b_v2 ddgan_ddb_v1 ddgan_ddb_v2 ddgan_ddb_v3;do
-    if [ "$model" == "$ddgan_ddb_v3" ]; then
         bs=32
     else
         bs=64
     fi

 #!/bin/bash
+#for model in ddgan_sd_v10 ddgan_laion2b_v2 ddgan_ddb_v1 ddgan_ddb_v2 ddgan_ddb_v3 ddgan_ddb_v4;do
+#for model in ddgan_ddb_v2 ddgan_ddb_v3 ddgan_ddb_v4 ddgan_ddb_v5;do
+#for model in ddgan_ddb_v4 ddgan_ddb_v6 ddgan_ddb_v7 ddgan_laion_aesthetic_v15;do
+#for model in ddgan_ddb_v6;do
+for model in ddgan_laion_aesthetic_v15;do
+    if [ "$model" == "ddgan_ddb_v3" ]; then
         bs=32
+    elif [ "$model" == "ddgan_laion_aesthetic_v15" ]; then
+        bs=32
+    elif [ "$model" == "ddgan_ddb_v6" ]; then
+        bs=32
+    elif [ "$model" == "ddgan_ddb_v4" ]; then
+        bs=16
     else
         bs=64
     fi

run.py CHANGED Viewed

@@ -256,6 +256,28 @@ def ddgan_ddb_v3():
     cfg['model']['num_timesteps'] = 2
     return cfg
 models = [
     ddgan_cifar10_cond17, # cifar10, cross attn for discr
     ddgan_cifar10_cond18, # cifar10, xl encoder
@@ -283,6 +305,7 @@ models = [
     ddgan_laion_aesthetic_v12,
     ddgan_laion_aesthetic_v13,
     ddgan_laion_aesthetic_v14,
     ddgan_laion2b_v1,
     ddgan_sd_v1,
     ddgan_sd_v2,
@@ -298,7 +321,11 @@ models = [
     ddgan_laion2b_v2,
     ddgan_ddb_v1,
     ddgan_ddb_v2,
-    ddgan_ddb_v3
 ]
 def get_model(model_name):

     cfg['model']['num_timesteps'] = 2
     return cfg
+def ddgan_ddb_v4():
+    # Variant of ddgan_ddb_v1: starts from that config, then sets the model's
+    # num_channels_dae to 256 and num_timesteps to 2.
+    cfg = ddgan_ddb_v1()
+    cfg['model']['num_channels_dae'] = 256
+    cfg['model']['num_timesteps'] = 2
+    return cfg
+def ddgan_ddb_v5():
+    # Returns the ddgan_ddb_v2 config with no overrides applied here.
+    cfg = ddgan_ddb_v2()
+    return cfg
+def ddgan_ddb_v6():
+    # Returns the ddgan_ddb_v3 config with no overrides applied here.
+    cfg = ddgan_ddb_v3()
+    return cfg
+def ddgan_ddb_v7():
+    # Returns the ddgan_ddb_v1 config with no overrides applied here.
+    cfg = ddgan_ddb_v1()
+    return cfg
+def ddgan_laion_aesthetic_v15():
+    # Returns the ddgan_ddb_v3 config with no overrides applied here.
+    cfg = ddgan_ddb_v3()
+    return cfg
 models = [
     ddgan_cifar10_cond17, # cifar10, cross attn for discr
     ddgan_cifar10_cond18, # cifar10, xl encoder
     ddgan_laion_aesthetic_v12,
     ddgan_laion_aesthetic_v13,
     ddgan_laion_aesthetic_v14,
+    ddgan_laion_aesthetic_v15,
     ddgan_laion2b_v1,
     ddgan_sd_v1,
     ddgan_sd_v2,
     ddgan_laion2b_v2,
     ddgan_ddb_v1,
     ddgan_ddb_v2,
+    ddgan_ddb_v3,
+    ddgan_ddb_v4,
+    ddgan_ddb_v5,
+    ddgan_ddb_v6,
+    ddgan_ddb_v7,
 ]
 def get_model(model_name):

score_sde/models/discriminator.py CHANGED Viewed

@@ -181,7 +181,7 @@ class SmallCondAttnDiscriminator(nn.Module):
         hidden_dim=t_emb_dim,
         output_dim=t_emb_dim,
         act=act,
-        )
@@ -368,7 +368,7 @@ class CondAttnDiscriminator(nn.Module):
             hidden_dim=t_emb_dim,
             output_dim=t_emb_dim,
             act=act,
-        )
     self.start_conv = conv2d(nc,ngf*2,1, padding=0)
     self.conv1 = DownConvBlock(ngf*2, ngf*4, t_emb_dim = t_emb_dim, downsample = True, act=act)

         hidden_dim=t_emb_dim,
         output_dim=t_emb_dim,
         act=act,
+    )
             hidden_dim=t_emb_dim,
             output_dim=t_emb_dim,
             act=act,
+    )
     self.start_conv = conv2d(nc,ngf*2,1, padding=0)
     self.conv1 = DownConvBlock(ngf*2, ngf*4, t_emb_dim = t_emb_dim, downsample = True, act=act)

score_sde/models/layers.py CHANGED Viewed

@@ -559,6 +559,7 @@ class CondAttnBlock(nn.Module):
     h = h.permute(0,2,1)
     h = h.contiguous()
     h_new = self.ca(h, cond, mask=mask)
     h_new = h_new.permute(0,2,1)
     h_new = h_new.contiguous()
     h_new = h_new.view(B, C, H, W)

     h = h.permute(0,2,1)
     h = h.contiguous()
     h_new = self.ca(h, cond, mask=mask)
+    # print(h_new.min(), h_new.max())
     h_new = h_new.permute(0,2,1)
     h_new = h_new.contiguous()
     h_new = h_new.view(B, C, H, W)

score_sde/models/projected_discriminator.py ADDED Viewed

	@@ -0,0 +1,783 @@

+from functools import partial
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+#from pg_modules.blocks import DownBlock, DownBlockPatch, conv2d
+import functools
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils import spectral_norm
+from . import layers
+from .layers import CondAttnBlock
+from .discriminator import *
+def conv2d(*args, **kwargs):
+    # nn.Conv2d wrapped in spectral normalization; all arguments are forwarded as-is.
+    return spectral_norm(nn.Conv2d(*args, **kwargs))
+def convTranspose2d(*args, **kwargs):
+    # nn.ConvTranspose2d wrapped in spectral normalization; arguments forwarded as-is.
+    return spectral_norm(nn.ConvTranspose2d(*args, **kwargs))
+def embedding(*args, **kwargs):
+    # nn.Embedding wrapped in spectral normalization; arguments forwarded as-is.
+    return spectral_norm(nn.Embedding(*args, **kwargs))
+def linear(*args, **kwargs):
+    # nn.Linear wrapped in spectral normalization; arguments forwarded as-is.
+    return spectral_norm(nn.Linear(*args, **kwargs))
+def NormLayer(c, mode='batch'):
+    """Build a normalization layer for ``c`` channels.
+
+    Args:
+        c: number of channels to normalize.
+        mode: ``'group'`` for ``nn.GroupNorm`` with ``c // 2`` groups, or
+            ``'batch'`` (default) for ``nn.BatchNorm2d``.
+
+    Returns:
+        The constructed normalization module.
+
+    Raises:
+        ValueError: if ``mode`` is not a supported name.
+    """
+    if mode == 'group':
+        return nn.GroupNorm(c//2, c)
+    if mode == 'batch':
+        return nn.BatchNorm2d(c)
+    # Previously an unknown mode fell through and returned None, which only
+    # failed later when the "layer" was actually used.
+    raise ValueError(f"Unknown normalization mode: {mode!r}")
+### Activations
+class GLU(nn.Module):
+    """Gated linear unit: first half of the channels times sigmoid of the second half."""
+    def forward(self, x):
+        channels = x.size(1)
+        assert channels % 2 == 0, 'channels dont divide 2!'
+        half = int(channels/2)
+        # The trailing half of the channel axis acts as the gate.
+        gate = torch.sigmoid(x[:, half:])
+        return x[:, :half] * gate
+class Swish(nn.Module):
+    """Swish activation: the input multiplied by its own sigmoid."""
+    def forward(self, feat):
+        gate = torch.sigmoid(feat)
+        return feat * gate
+### Upblocks
+class InitLayer(nn.Module):
+    # Turns a noise vector into an initial feature map with `channel` channels
+    # and spatial size sz x sz.
+    def __init__(self, nz, channel, sz=4):
+        super().__init__()
+        # The transposed conv emits 2*channel maps because the GLU that follows
+        # halves the channel count.
+        self.init = nn.Sequential(
+            convTranspose2d(nz, channel*2, sz, 1, 0, bias=False),
+            NormLayer(channel*2),
+            GLU(),
+        )
+    def forward(self, noise):
+        # Flatten everything after the batch axis into channels on a 1x1 grid.
+        noise = noise.view(noise.shape[0], -1, 1, 1)
+        return self.init(noise)
+def UpBlockSmall(in_planes, out_planes):
+    # 2x nearest-neighbour upsampling followed by one conv -> norm -> GLU stage.
+    # The conv produces 2*out_planes channels; GLU halves them to out_planes.
+    block = nn.Sequential(
+        nn.Upsample(scale_factor=2, mode='nearest'),
+        conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False),
+        NormLayer(out_planes*2), GLU())
+    return block
+class UpBlockSmallCond(nn.Module):
+    # Conditional counterpart of UpBlockSmall: the norm layer is a CCBN whose
+    # gain and bias are predicted from a conditioning vector of size z_dim.
+    def __init__(self, in_planes, out_planes, z_dim):
+        super().__init__()
+        self.in_planes = in_planes
+        self.out_planes = out_planes
+        self.up = nn.Upsample(scale_factor=2, mode='nearest')
+        self.conv = conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False)
+        which_bn = functools.partial(CCBN, which_linear=linear, input_size=z_dim)
+        self.bn = which_bn(2*out_planes)
+        self.act = GLU()
+    def forward(self, x, c):
+        # x: feature map to upsample; c: conditioning vector consumed by the CCBN.
+        x = self.up(x)
+        x = self.conv(x)
+        x = self.bn(x, c)
+        x = self.act(x)
+        return x
+def UpBlockBig(in_planes, out_planes):
+    # 2x nearest-neighbour upsampling followed by two conv -> noise -> norm -> GLU
+    # stages. Each conv emits 2*out_planes channels and each GLU halves them, so
+    # the second conv takes out_planes channels as input.
+    block = nn.Sequential(
+        nn.Upsample(scale_factor=2, mode='nearest'),
+        conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False),
+        NoiseInjection(),
+        NormLayer(out_planes*2), GLU(),
+        conv2d(out_planes, out_planes*2, 3, 1, 1, bias=False),
+        NoiseInjection(),
+        NormLayer(out_planes*2), GLU()
+        )
+    return block
+class UpBlockBigCond(nn.Module):
+    # Conditional counterpart of UpBlockBig: two conv -> noise -> CCBN -> GLU
+    # stages after a 2x upsample, with CCBN conditioned on a vector of size z_dim.
+    def __init__(self, in_planes, out_planes, z_dim):
+        super().__init__()
+        self.in_planes = in_planes
+        self.out_planes = out_planes
+        self.up = nn.Upsample(scale_factor=2, mode='nearest')
+        self.conv1 = conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False)
+        self.conv2 = conv2d(out_planes, out_planes*2, 3, 1, 1, bias=False)
+        which_bn = functools.partial(CCBN, which_linear=linear, input_size=z_dim)
+        self.bn1 = which_bn(2*out_planes)
+        self.bn2 = which_bn(2*out_planes)
+        self.act = GLU()
+        # A single NoiseInjection (one learned weight) is shared by both stages,
+        # unlike UpBlockBig which creates one per stage.
+        self.noise = NoiseInjection()
+    def forward(self, x, c):
+        # block 1
+        x = self.up(x)
+        x = self.conv1(x)
+        x = self.noise(x)
+        x = self.bn1(x, c)
+        x = self.act(x)
+        # block 2
+        x = self.conv2(x)
+        x = self.noise(x)
+        x = self.bn2(x, c)
+        x = self.act(x)
+        return x
+class SEBlock(nn.Module):
+    # Gates feat_big channel-wise with values in (0, 1) computed from feat_small.
+    def __init__(self, ch_in, ch_out):
+        super().__init__()
+        # Pool to 4x4, then a 4x4 conv without padding collapses that to 1x1, so
+        # the sigmoid output is one gate per output channel.
+        self.main = nn.Sequential(
+            nn.AdaptiveAvgPool2d(4),
+            conv2d(ch_in, ch_out, 4, 1, 0, bias=False),
+            Swish(),
+            conv2d(ch_out, ch_out, 1, 1, 0, bias=False),
+            nn.Sigmoid(),
+        )
+    def forward(self, feat_small, feat_big):
+        # The 1x1 gate broadcasts over feat_big's spatial dimensions.
+        return feat_big * self.main(feat_small)
+### Downblocks
+class SeparableConv2d(nn.Module):
+    # Depthwise conv (groups == in_channels) followed by a 1x1 pointwise conv,
+    # both spectrally normalized.
+    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
+        super(SeparableConv2d, self).__init__()
+        # NOTE: padding is fixed at 1, so the spatial size is preserved only for
+        # kernel_size == 3.
+        self.depthwise = conv2d(in_channels, in_channels, kernel_size=kernel_size,
+            groups=in_channels, bias=bias, padding=1)
+        self.pointwise = conv2d(in_channels, out_channels,
+            kernel_size=1, bias=bias)
+    def forward(self, x):
+        out = self.depthwise(x)
+        out = self.pointwise(out)
+        return out
+class DownBlock(nn.Module):
+    # Downsampling block: conv/norm/LeakyReLU that halves the spatial resolution.
+    def __init__(self, in_planes, out_planes, separable=False):
+        super().__init__()
+        if not separable:
+            # Downsampling is done by the strided 4x4 conv (stride 2, padding 1).
+            self.main = nn.Sequential(
+                conv2d(in_planes, out_planes, 4, 2, 1),
+                NormLayer(out_planes),
+                nn.LeakyReLU(0.2, inplace=True),
+            )
+        else:
+            # Separable variant keeps resolution in the conv and downsamples
+            # with a 2x2 average pool at the end.
+            self.main = nn.Sequential(
+                SeparableConv2d(in_planes, out_planes, 3),
+                NormLayer(out_planes),
+                nn.LeakyReLU(0.2, inplace=True),
+                nn.AvgPool2d(2, 2),
+            )
+    def forward(self, feat):
+        return self.main(feat)
+class DownBlockPatch(nn.Module):
+    # A DownBlock followed by an extra 1x1 conv -> norm -> LeakyReLU stage that
+    # keeps the channel count and resolution unchanged.
+    def __init__(self, in_planes, out_planes, separable=False):
+        super().__init__()
+        self.main = nn.Sequential(
+            DownBlock(in_planes, out_planes, separable),
+            conv2d(out_planes, out_planes, 1, 1, 0, bias=False),
+            NormLayer(out_planes),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+    def forward(self, feat):
+        return self.main(feat)
+### CSM
+class ResidualConvUnit(nn.Module):
+    # Single 3x3 conv with a residual (skip) connection: conv(x) + x.
+    # NOTE: `activation` and `bn` are accepted but not used by this implementation.
+    def __init__(self, cin, activation, bn):
+        super().__init__()
+        self.conv = nn.Conv2d(cin, cin, kernel_size=3, stride=1, padding=1, bias=True)
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, x):
+        return self.skip_add.add(self.conv(x), x)
+class FeatureFusionBlock(nn.Module):
+    # Fuses one or two feature maps: optional add, 2x bilinear upsample, 1x1 conv.
+    # NOTE: `activation`, `bn` and `lowest` are accepted but not used here, and
+    # `deconv` is only stored on the instance.
+    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, lowest=False):
+        super().__init__()
+        self.deconv = deconv
+        self.align_corners = align_corners
+        self.expand = expand
+        out_features = features
+        # With expand=True the output conv halves the channel count.
+        if self.expand==True:
+            out_features = features//2
+        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, *xs):
+        # xs[0] is always used; when exactly two inputs are given they are
+        # summed first, so they must have matching shapes.
+        output = xs[0]
+        if len(xs) == 2:
+            output = self.skip_add.add(output, xs[1])
+        output = nn.functional.interpolate(
+            output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
+        )
+        output = self.out_conv(output)
+        return output
+### Misc
+class NoiseInjection(nn.Module):
+    """Add Gaussian noise to a feature map, scaled by one learned weight.
+
+    The weight starts at zero, so a freshly constructed module is an identity.
+    """
+    def __init__(self):
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(1), requires_grad=True)
+    def forward(self, feat, noise=None):
+        """Return ``feat + weight * noise``.
+
+        Args:
+            feat: 4-D feature map; its shape is unpacked as
+                (batch, channels, height, width).
+            noise: optional noise tensor broadcastable to ``feat``. When
+                omitted, one noise plane per sample (shared by all channels)
+                is drawn from a standard normal distribution.
+        """
+        if noise is None:
+            batch, _, height, width = feat.shape
+            # Sample directly on feat's device and in its dtype rather than on
+            # the CPU in the default dtype followed by a copy. Note that for
+            # non-CPU inputs this draws from that device's RNG stream.
+            noise = torch.randn(batch, 1, height, width, device=feat.device, dtype=feat.dtype)
+        return feat + self.weight * noise
+class CCBN(nn.Module):
+    ''' conditional batchnorm
+
+    Batch normalization without its own affine parameters; the per-channel
+    gain and bias are predicted from a conditioning vector instead.
+
+    Args:
+        output_size: number of channels of the feature map being normalized.
+        input_size: size of the conditioning vector.
+        which_linear: factory called as ``which_linear(input_size, output_size)``
+            to build the gain and bias predictors.
+        eps: epsilon to avoid dividing by 0.
+        momentum: momentum for the running mean/variance buffers.
+    '''
+    def __init__(self, output_size, input_size, which_linear, eps=1e-5, momentum=0.1):
+        super().__init__()
+        self.output_size, self.input_size = output_size, input_size
+        # Prepare gain and bias layers
+        self.gain = which_linear(input_size, output_size)
+        self.bias = which_linear(input_size, output_size)
+        # epsilon to avoid dividing by 0
+        self.eps = eps
+        # Momentum
+        self.momentum = momentum
+        self.register_buffer('stored_mean', torch.zeros(output_size))
+        self.register_buffer('stored_var', torch.ones(output_size))
+    def forward(self, x, y):
+        """Normalize ``x`` and modulate it with gain/bias predicted from ``y``."""
+        # Calculate class-conditional gains and biases
+        gain = (1 + self.gain(y)).view(y.size(0), -1, 1, 1)
+        bias = self.bias(y).view(y.size(0), -1, 1, 1)
+        # Use the configured momentum; it was previously hard-coded to 0.1 here,
+        # so the constructor argument had no effect.
+        out = F.batch_norm(x, self.stored_mean, self.stored_var, None, None,
+                           self.training, self.momentum, self.eps)
+        return out * gain + bias
+class Interpolate(nn.Module):
+    """Interpolation module that resizes its input to a fixed output size."""
+    def __init__(self, size, mode='bilinear', align_corners=False):
+        """Init.
+        Args:
+            size: target output size, forwarded to nn.functional.interpolate
+            mode (str): interpolation mode
+            align_corners (bool): forwarded to nn.functional.interpolate
+        """
+        super(Interpolate, self).__init__()
+        self.interp = nn.functional.interpolate
+        self.size = size
+        self.mode = mode
+        self.align_corners = align_corners
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: interpolated data
+        """
+        x = self.interp(
+            x,
+            size=self.size,
+            mode=self.mode,
+            align_corners=self.align_corners,
+        )
+        return x
+#from pg_modules.projector import F_RandomProj
+import torch
+import torch.nn as nn
+import timm
+#from pg_modules.blocks import FeatureFusionBlock
+def _make_scratch_ccm(scratch, in_channels, cout, expand=False):
+    # Attaches four 1x1 convs (layer0_ccm..layer3_ccm), one per backbone stage,
+    # to `scratch` and records their output widths in scratch.CHANNELS.
+    # in_channels must hold the four stage channel counts.
+    # shapes
+    # With expand=True the width doubles per stage; otherwise all stages use cout.
+    out_channels = [cout, cout*2, cout*4, cout*8] if expand else [cout]*4
+    scratch.layer0_ccm = nn.Conv2d(in_channels[0], out_channels[0], kernel_size=1, stride=1, padding=0, bias=True)
+    scratch.layer1_ccm = nn.Conv2d(in_channels[1], out_channels[1], kernel_size=1, stride=1, padding=0, bias=True)
+    scratch.layer2_ccm = nn.Conv2d(in_channels[2], out_channels[2], kernel_size=1, stride=1, padding=0, bias=True)
+    scratch.layer3_ccm = nn.Conv2d(in_channels[3], out_channels[3], kernel_size=1, stride=1, padding=0, bias=True)
+    scratch.CHANNELS = out_channels
+    return scratch
+def _make_scratch_csm(scratch, in_channels, cout, expand):
+    # Attaches four FeatureFusionBlocks (layer3_csm..layer0_csm) to `scratch`
+    # and overwrites scratch.CHANNELS with the post-fusion channel counts.
+    # The nn.ReLU argument is passed through but FeatureFusionBlock ignores it.
+    scratch.layer3_csm = FeatureFusionBlock(in_channels[3], nn.ReLU(False), expand=expand, lowest=True)
+    scratch.layer2_csm = FeatureFusionBlock(in_channels[2], nn.ReLU(False), expand=expand)
+    scratch.layer1_csm = FeatureFusionBlock(in_channels[1], nn.ReLU(False), expand=expand)
+    scratch.layer0_csm = FeatureFusionBlock(in_channels[0], nn.ReLU(False))
+    # last refinenet does not expand to save channels in higher dimensions
+    scratch.CHANNELS = [cout, cout, cout*2, cout*4] if expand else [cout]*4
+    return scratch
+def _make_efficientnet(model):
+    # Regroups an EfficientNet-style model (needs conv_stem, bn1, act1, blocks)
+    # into four sequential stages layer0..layer3 on a plain nn.Module container.
+    # The stages reuse the model's own sub-modules rather than copying them.
+    pretrained = nn.Module()
+    pretrained.layer0 = nn.Sequential(model.conv_stem, model.bn1, model.act1, *model.blocks[0:2])
+    pretrained.layer1 = nn.Sequential(*model.blocks[2:3])
+    pretrained.layer2 = nn.Sequential(*model.blocks[3:5])
+    pretrained.layer3 = nn.Sequential(*model.blocks[5:9])
+    return pretrained
+def calc_channels(pretrained, inp_res=224):
+    """Return the output channel count of each of the four backbone stages.
+
+    Pushes an all-zero probe image of shape (1, 3, inp_res, inp_res) through
+    ``pretrained.layer0`` .. ``pretrained.layer3`` in order and records
+    ``shape[1]`` after every stage.
+
+    Args:
+        pretrained: object exposing callables ``layer0`` .. ``layer3``.
+        inp_res: side length of the square probe image.
+
+    Returns:
+        list of four ints, one per stage.
+    """
+    channels = []
+    stages = (pretrained.layer0, pretrained.layer1, pretrained.layer2, pretrained.layer3)
+    # Only the shapes are needed, so do not build an autograd graph.
+    # NOTE(review): the stages run in whatever train/eval mode they are in; if
+    # that is training mode, BatchNorm layers would update their running
+    # statistics from this zero probe - confirm this is acceptable.
+    with torch.no_grad():
+        tmp = torch.zeros(1, 3, inp_res, inp_res)
+        for stage in stages:
+            tmp = stage(tmp)
+            channels.append(tmp.shape[1])
+    return channels
+def _make_projector(im_res, cout, proj_type, expand=False):
+    # Builds the feature network and, depending on proj_type, the projection
+    # layers. Returns (pretrained, scratch); scratch is None when proj_type == 0.
+    # pretrained.CHANNELS / pretrained.RESOLUTIONS describe the feature maps
+    # that the selected projection type will output.
+    assert proj_type in [0, 1, 2], "Invalid projection type"
+    ### Build pretrained feature network
+    # pretrained=True makes timm load pretrained weights for the backbone.
+    model = timm.create_model('tf_efficientnet_lite0', pretrained=True)
+    pretrained = _make_efficientnet(model)
+    # determine resolution of feature maps, this is later used to calculate the number
+    # of down blocks in the discriminators. Interestingly, the best results are achieved
+    # by fixing this to 256, ie., we use the same number of down blocks per discriminator
+    # independent of the dataset resolution
+    # NOTE: this deliberately overrides the im_res argument, which is otherwise unused.
+    im_res = 256
+    pretrained.RESOLUTIONS = [im_res//4, im_res//8, im_res//16, im_res//32]
+    pretrained.CHANNELS = calc_channels(pretrained)
+    if proj_type == 0: return pretrained, None
+    ### Build CCM
+    scratch = nn.Module()
+    scratch = _make_scratch_ccm(scratch, in_channels=pretrained.CHANNELS, cout=cout, expand=expand)
+    pretrained.CHANNELS = scratch.CHANNELS
+    if proj_type == 1: return pretrained, scratch
+    ### build CSM
+    scratch = _make_scratch_csm(scratch, in_channels=scratch.CHANNELS, cout=cout, expand=expand)
+    # CSM upsamples x2 so the feature map resolution doubles
+    pretrained.RESOLUTIONS = [res*2 for res in pretrained.RESOLUTIONS]
+    pretrained.CHANNELS = scratch.CHANNELS
+    return pretrained, scratch
+class F_RandomProj(nn.Module):
+    # Feature extractor: a four-stage backbone, optionally followed by 1x1
+    # channel-mixing convs (proj_type >= 1) and top-down scale mixing
+    # (proj_type == 2). forward() returns a dict keyed '0'..'3'.
+    def __init__(
+        self,
+        im_res=256,
+        cout=64,
+        expand=True,
+        proj_type=2,  # 0 = no projection, 1 = cross channel mixing, 2 = cross scale mixing
+        **kwargs,
+    ):
+        # Extra keyword arguments are accepted and ignored.
+        super().__init__()
+        self.proj_type = proj_type
+        self.cout = cout
+        self.expand = expand
+        # build pretrained feature network and random decoder (scratch)
+        self.pretrained, self.scratch = _make_projector(im_res=im_res, cout=self.cout, proj_type=self.proj_type, expand=self.expand)
+        # Channel counts and resolutions of the returned feature maps.
+        self.CHANNELS = self.pretrained.CHANNELS
+        self.RESOLUTIONS = self.pretrained.RESOLUTIONS
+    def forward(self, x):
+        # predict feature maps
+        out0 = self.pretrained.layer0(x)
+        out1 = self.pretrained.layer1(out0)
+        out2 = self.pretrained.layer2(out1)
+        out3 = self.pretrained.layer3(out2)
+        # start enumerating at the lowest layer (this is where we put the first discriminator)
+        out = {
+            '0': out0,
+            '1': out1,
+            '2': out2,
+            '3': out3,
+        }
+        if self.proj_type == 0: return out
+        # Cross-channel mixing: one 1x1 conv per stage.
+        out0_channel_mixed = self.scratch.layer0_ccm(out['0'])
+        out1_channel_mixed = self.scratch.layer1_ccm(out['1'])
+        out2_channel_mixed = self.scratch.layer2_ccm(out['2'])
+        out3_channel_mixed = self.scratch.layer3_ccm(out['3'])
+        out = {
+            '0': out0_channel_mixed,
+            '1': out1_channel_mixed,
+            '2': out2_channel_mixed,
+            '3': out3_channel_mixed,
+        }
+        if self.proj_type == 1: return out
+        # from bottom to top
+        # Each fusion block adds the upsampled deeper map to the current stage's map.
+        out3_scale_mixed = self.scratch.layer3_csm(out3_channel_mixed)
+        out2_scale_mixed = self.scratch.layer2_csm(out3_scale_mixed, out2_channel_mixed)
+        out1_scale_mixed = self.scratch.layer1_csm(out2_scale_mixed, out1_channel_mixed)
+        out0_scale_mixed = self.scratch.layer0_csm(out1_scale_mixed, out0_channel_mixed)
+        out = {
+            '0': out0_scale_mixed,
+            '1': out1_scale_mixed,
+            '2': out2_scale_mixed,
+            '3': out3_scale_mixed,
+        }
+        return out
+#from pg_modules.diffaug import DiffAugment
+# Differentiable Augmentation for Data-Efficient GAN Training
+# Shengyu Zhao, Zhijian Liu, Ji Lin, Jun-Yan Zhu, and Song Han
+# https://arxiv.org/pdf/2006.10738
+import torch
+import torch.nn.functional as F
+def DiffAugment(x, policy='', channels_first=True):
+    """Apply the augmentations named in ``policy`` to the batch ``x``.
+
+    ``policy`` is a comma-separated list of AUGMENT_FNS keys, applied in the
+    order given. An empty policy returns ``x`` untouched. Channels-last input
+    is permuted to channels-first for the augmentations and permuted back.
+    """
+    if not policy:
+        return x
+    if not channels_first:
+        x = x.permute(0, 3, 1, 2)
+    for name in policy.split(','):
+        for augment in AUGMENT_FNS[name]:
+            x = augment(x)
+    if not channels_first:
+        x = x.permute(0, 2, 3, 1)
+    return x.contiguous()
+def rand_brightness(x):
+    """Add one uniform offset from [-0.5, 0.5) per sample to the whole image."""
+    offset = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5
+    return x + offset
+def rand_saturation(x):
+    """Scale each sample's deviation from its per-pixel channel mean by a factor in [0, 2)."""
+    channel_mean = x.mean(dim=1, keepdim=True)
+    factor = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2
+    return (x - channel_mean) * factor + channel_mean
+def rand_contrast(x):
+    """Scale each sample's deviation from its overall mean by a factor in [0.5, 1.5)."""
+    sample_mean = x.mean(dim=[1, 2, 3], keepdim=True)
+    factor = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5
+    return (x - sample_mean) * factor + sample_mean
+def rand_translation(x, ratio=0.125):
+    # Shifts every sample by a random integer offset along dims 2 and 3; content
+    # shifted in from outside the image is zero. Expects channels-first 4-D input.
+    # Maximum shift per axis is `ratio` of that axis' size, rounded to nearest.
+    shift_x, shift_y = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5)
+    # One offset per sample and axis, drawn from [-shift, shift] inclusive.
+    translation_x = torch.randint(-shift_x, shift_x + 1, size=[x.size(0), 1, 1], device=x.device)
+    translation_y = torch.randint(-shift_y, shift_y + 1, size=[x.size(0), 1, 1], device=x.device)
+    # NOTE(review): meshgrid is called without `indexing`; recent PyTorch versions
+    # warn about this - the code relies on the historical 'ij' behaviour.
+    grid_batch, grid_x, grid_y = torch.meshgrid(
+        torch.arange(x.size(0), dtype=torch.long, device=x.device),
+        torch.arange(x.size(2), dtype=torch.long, device=x.device),
+        torch.arange(x.size(3), dtype=torch.long, device=x.device),
+    )
+    # The +1 accounts for the one-pixel border added below; clamping sends any
+    # out-of-range index onto that zero border.
+    grid_x = torch.clamp(grid_x + translation_x + 1, 0, x.size(2) + 1)
+    grid_y = torch.clamp(grid_y + translation_y + 1, 0, x.size(3) + 1)
+    # Zero-pad the two spatial dims by one pixel on every side.
+    x_pad = F.pad(x, [1, 1, 1, 1, 0, 0, 0, 0])
+    # Gather in channels-last layout, then restore channels-first.
+    x = x_pad.permute(0, 2, 3, 1).contiguous()[grid_batch, grid_x, grid_y].permute(0, 3, 1, 2)
+    return x
+def rand_cutout(x, ratio=0.2):
+    # Zeroes one randomly placed rectangle per sample, across all channels.
+    # Expects channels-first 4-D input.
+    # Rectangle size per axis is `ratio` of that axis' size, rounded to nearest.
+    cutout_size = int(x.size(2) * ratio + 0.5), int(x.size(3) * ratio + 0.5)
+    # Random rectangle centre per sample; the upper bound is one larger when the
+    # cutout size is even.
+    offset_x = torch.randint(0, x.size(2) + (1 - cutout_size[0] % 2), size=[x.size(0), 1, 1], device=x.device)
+    offset_y = torch.randint(0, x.size(3) + (1 - cutout_size[1] % 2), size=[x.size(0), 1, 1], device=x.device)
+    # NOTE(review): meshgrid is called without `indexing`; recent PyTorch versions
+    # warn about this - the code relies on the historical 'ij' behaviour.
+    grid_batch, grid_x, grid_y = torch.meshgrid(
+        torch.arange(x.size(0), dtype=torch.long, device=x.device),
+        torch.arange(cutout_size[0], dtype=torch.long, device=x.device),
+        torch.arange(cutout_size[1], dtype=torch.long, device=x.device),
+    )
+    # Centre the rectangle on the offset and clamp it to the image bounds, so a
+    # rectangle near an edge is effectively cropped.
+    grid_x = torch.clamp(grid_x + offset_x - cutout_size[0] // 2, min=0, max=x.size(2) - 1)
+    grid_y = torch.clamp(grid_y + offset_y - cutout_size[1] // 2, min=0, max=x.size(3) - 1)
+    # Build a per-sample keep-mask (1 = keep, 0 = cut) and apply it to every channel.
+    mask = torch.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device)
+    mask[grid_batch, grid_x, grid_y] = 0
+    x = x * mask.unsqueeze(1)
+    return x
+# Maps each policy name accepted by DiffAugment to the augmentation functions
+# it applies, in order.
+AUGMENT_FNS = {
+    'color': [rand_brightness, rand_saturation, rand_contrast],
+    'translation': [rand_translation],
+    'cutout': [rand_cutout],
+}
+class SingleDisc(nn.Module):
+    # Unconditional discriminator head: a stack of down blocks that halves the
+    # resolution from start_sz to end_sz, followed by a 4x4 conv to one channel.
+    def __init__(self, nc=None, ndf=None, start_sz=256, end_sz=8, head=None, separable=False, patch=False):
+        super().__init__()
+        # Default channel width for each feature-map resolution.
+        channel_dict = {4: 512, 8: 512, 16: 256, 32: 128, 64: 64, 128: 64,
+                        256: 32, 512: 16, 1024: 8}
+        # interpolate for start sz that are not powers of two
+        # (i.e. snap start_sz to the nearest key of channel_dict)
+        if start_sz not in channel_dict.keys():
+            sizes = np.array(list(channel_dict.keys()))
+            start_sz = sizes[np.argmin(abs(sizes - start_sz))]
+        self.start_sz = start_sz
+        # if given ndf, allocate all layers with the same ndf
+        if ndf is None:
+            nfc = channel_dict
+        else:
+            nfc = {k: ndf for k, v in channel_dict.items()}
+        # for feature map discriminators with nfc not in channel_dict
+        # this is the case for the pretrained backbone (midas.pretrained)
+        if nc is not None and head is None:
+            nfc[start_sz] = nc
+        layers = []
+        # Head if the initial input is the full modality
+        if head:
+            layers += [conv2d(nc, nfc[256], 3, 1, 1, bias=False),
+                       nn.LeakyReLU(0.2, inplace=True)]
+        # Down Blocks
+        DB = partial(DownBlockPatch, separable=separable) if patch else partial(DownBlock, separable=separable)
+        # One down block per halving of the resolution until end_sz is reached.
+        while start_sz > end_sz:
+            layers.append(DB(nfc[start_sz],  nfc[start_sz//2]))
+            start_sz = start_sz // 2
+        layers.append(conv2d(nfc[end_sz], 1, 4, 1, 0, bias=False))
+        self.main = nn.Sequential(*layers)
+    def forward(self, x, c):
+        # `c` is accepted for interface parity with SingleDiscCond and ignored.
+        return self.main(x)
+class SingleDiscCond(nn.Module):
+    # Conditional discriminator head: the same down-block stack as SingleDisc,
+    # then a projection to cmap_dim channels that is combined with a
+    # cross-attention map computed from the conditioning sequence.
+    # NOTE: c_dim and embedding_dim are only referenced by commented-out code.
+    def __init__(self, nc=None, ndf=None, start_sz=256, end_sz=8, head=None, separable=False, patch=False, c_dim=1000, cmap_dim=64, embedding_dim=128, cond_size=128):
+        super().__init__()
+        self.cmap_dim = cmap_dim
+        self.cond_attn = CondAttnBlock(cmap_dim, cond_size, dim_head=64, heads=8, norm_context=False, cosine_sim_attn=False)
+        # midas channels
+        channel_dict = {4: 512, 8: 512, 16: 256, 32: 128, 64: 64, 128: 64,
+                        256: 32, 512: 16, 1024: 8}
+        # interpolate for start sz that are not powers of two
+        # (i.e. snap start_sz to the nearest key of channel_dict)
+        if start_sz not in channel_dict.keys():
+            sizes = np.array(list(channel_dict.keys()))
+            start_sz = sizes[np.argmin(abs(sizes - start_sz))]
+        self.start_sz = start_sz
+        # if given ndf, allocate all layers with the same ndf
+        if ndf is None:
+            nfc = channel_dict
+        else:
+            nfc = {k: ndf for k, v in channel_dict.items()}
+        # for feature map discriminators with nfc not in channel_dict
+        # this is the case for the pretrained backbone (midas.pretrained)
+        if nc is not None and head is None:
+            nfc[start_sz] = nc
+        layers = []
+        # Head if the initial input is the full modality
+        if head:
+            layers += [conv2d(nc, nfc[256], 3, 1, 1, bias=False),
+                       nn.LeakyReLU(0.2, inplace=True)]
+        # Down Blocks
+        DB = partial(DownBlockPatch, separable=separable) if patch else partial(DownBlock, separable=separable)
+        while start_sz > end_sz:
+            layers.append(DB(nfc[start_sz],  nfc[start_sz//2]))
+            start_sz = start_sz // 2
+        self.main = nn.Sequential(*layers)
+        # additions for conditioning on class information
+        self.cls = conv2d(nfc[end_sz], self.cmap_dim, 4, 1, 0, bias=False)
+        #self.embed = nn.Embedding(num_embeddings=c_dim, embedding_dim=embedding_dim)
+        #self.embed_proj = nn.Sequential(
+        #    nn.Linear(self.embed.embedding_dim, self.cmap_dim),
+        #    nn.LeakyReLU(0.2, inplace=True),
+        #)
+    def forward(self, x, c):
+        # c must unpack into (cond_pooled, cond, cond_mask); cond_pooled is unused here.
+        h = self.main(x)
+        out = self.cls(h)
+        cond_pooled, cond, cond_mask = c
+        #print("COND", out.shape, cond.shape, cond_mask.shape, self.cond_sie)
+        cmap = self.cond_attn(out, cond, cond_mask)
+        # conditioning via projection
+        #cmap = self.embed_proj(self.embed(c)).unsqueeze(-1).unsqueeze(-1)
+        #cmap = 1
+        # Channel-wise inner product of features and attention map, scaled by
+        # 1/sqrt(cmap_dim); yields a single-channel logit map.
+        out = (out * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim))
+        return out
+class MultiScaleD(nn.Module):
+    """Set of per-scale discriminators applied to a dict of feature maps.
+
+    Args:
+        channels: channel count of each feature map, lowest level first.
+        resolutions: spatial resolution of each feature map, same order.
+        num_discs: how many of the feature maps (1-4) get a discriminator.
+        proj_type: accepted for signature compatibility; not used here.
+        cond: truthy selects SingleDiscCond, falsy selects SingleDisc.
+        separable: forwarded to every discriminator.
+        patch: forwarded to every discriminator; also fixes start_sz to 16.
+        cond_size: forwarded to SingleDiscCond only.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(
+        self,
+        channels,
+        resolutions,
+        num_discs=1,
+        proj_type=2,  # 0 = no projection, 1 = cross channel mixing, 2 = cross scale mixing
+        cond=1,
+        separable=False,
+        patch=False,
+        cond_size=128,
+        **kwargs,
+    ):
+        super().__init__()
+        assert num_discs in [1, 2, 3, 4]
+        # the first disc is on the lowest level of the backbone
+        self.disc_in_channels = channels[:num_discs]
+        self.disc_in_res = resolutions[:num_discs]
+        Disc = SingleDiscCond if cond else SingleDisc
+        disc_kwargs = dict(end_sz=8, separable=separable, patch=patch)
+        # SingleDisc has no cond_size parameter; passing it unconditionally
+        # made cond=0 fail with a TypeError.
+        if cond:
+            disc_kwargs['cond_size'] = cond_size
+        mini_discs = []
+        for i, (cin, res) in enumerate(zip(self.disc_in_channels, self.disc_in_res)):
+            start_sz = res if not patch else 16
+            # ModuleDict accepts an iterable of (key, module) pairs.
+            mini_discs.append((str(i), Disc(nc=cin, start_sz=start_sz, **disc_kwargs)))
+        self.mini_discs = nn.ModuleDict(mini_discs)
+    def forward(self, features, c):
+        """Run each discriminator on ``features[key]`` and concatenate the logits.
+
+        Every discriminator output is flattened to (batch, -1); the results
+        are concatenated along dim 1.
+        """
+        all_logits = []
+        for k, disc in self.mini_discs.items():
+            all_logits.append(disc(features[k], c).view(features[k].size(0), -1))
+        all_logits = torch.cat(all_logits, dim=1)
+        return all_logits
+class ProjectedDiscriminator(torch.nn.Module):
+    """Discriminator operating on backbone features of ``x`` and ``xprev``.
+
+    Both images go through the same feature network. Per scale, the two
+    feature maps and a broadcast timestep embedding are concatenated along
+    the channel axis and passed to the multi-scale discriminator.
+
+    Args:
+        diffaug: apply DiffAugment ('color,translation,cutout') to ``x``.
+        interp224: resize the inputs to 256x256 before feature extraction
+            (the name is historical; the target size is 256).
+        t_emb_dim: embedding and hidden size of the timestep embedding.
+        out_dim: output size of the timestep embedding, i.e. the number of
+            channels it adds to each feature map.
+        backbone_kwargs: keyword arguments forwarded to both F_RandomProj and
+            MultiScaleD; defaults to no extra arguments.
+        act: activation passed to TimestepEmbedding.
+        num_discs: number of timestep embeddings / feature scales prepared.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(
+        self,
+        diffaug=False,
+        interp224=False,
+        t_emb_dim = 128,
+        out_dim=64,
+        backbone_kwargs=None,
+        act=torch.nn.LeakyReLU(0.2),
+        num_discs=1,
+        **kwargs
+    ):
+        super().__init__()
+        # Avoid a shared mutable default argument.
+        backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs
+        self.diffaug = diffaug
+        self.act = act
+        self.interp224 = interp224
+        self.num_discs = num_discs
+        self.feature_network = F_RandomProj(**backbone_kwargs)
+        # Each discriminator sees two feature maps (x and xprev) plus the
+        # out_dim-channel timestep embedding, hence c*2+out_dim input channels.
+        # NOTE(review): num_discs is not forwarded to MultiScaleD, so it builds
+        # its default number of discriminators unless backbone_kwargs sets
+        # num_discs - confirm this is intended.
+        self.discriminator = MultiScaleD(
+            channels=[c*2+out_dim for c in self.feature_network.CHANNELS],
+            resolutions=self.feature_network.RESOLUTIONS,
+            **backbone_kwargs,
+        )
+        self.t_embed = torch.nn.ModuleList([TimestepEmbedding(
+            embedding_dim=t_emb_dim,
+            hidden_dim=t_emb_dim,
+            output_dim=out_dim,
+            act=act,
+        ) for _ in range(num_discs)])
+    def train(self, mode=True):
+        """Set the training mode, always keeping the feature network in eval mode."""
+        # Keep the module's own flag and the timestep embeddings in sync with
+        # `mode`; previously only the discriminator was switched.
+        self.training = mode
+        self.feature_network = self.feature_network.train(False)
+        self.discriminator = self.discriminator.train(mode)
+        self.t_embed = self.t_embed.train(mode)
+        return self
+    def eval(self):
+        return self.train(False)
+    def forward(self, x, t, xprev, cond=None):
+        """Return the concatenated discriminator logits.
+
+        Args:
+            x: image batch.
+            t: timesteps passed to the timestep embeddings.
+            xprev: second image batch; must yield feature maps of the same
+                spatial size as ``x``.
+            cond: conditioning passed through to the discriminator.
+        """
+        #t_embed = self.t_embed(t)
+        #t_embed = self.act(t_embed)
+        # NOTE(review): augmentation is applied to x only, not to xprev -
+        # confirm this asymmetry is intended.
+        if self.diffaug:
+            x = DiffAugment(x, policy='color,translation,cutout')
+        if self.interp224:
+            # Resize both inputs: resizing only x left the two feature maps
+            # with different spatial sizes, so the torch.cat below failed.
+            x = F.interpolate(x, 256, mode='bilinear', align_corners=False)
+            xprev = F.interpolate(xprev, 256, mode='bilinear', align_corners=False)
+        features1 = self.feature_network(x)
+        features2 = self.feature_network(xprev)
+        features = {}
+        for k in features1.keys():
+            # Only the first num_discs scales have a timestep embedding.
+            if int(k) >= self.num_discs:
+                continue
+            tcat = self.t_embed[int(k)](t)
+            #print(tcat.shape)
+            h, w = features1[k].shape[2:]
+            # Broadcast the embedding over the spatial grid of this scale.
+            tcat = tcat.view(tcat.shape[0], tcat.shape[1], 1, 1).repeat(1,1, h, w)
+            #print(x.shape, xprev.shape, features1[k].shape, features2[k].shape, tcat.shape)
+            features[k] = torch.cat((features1[k], features2[k], tcat), dim=1)
+            #print(features[k].shape)
+        logits = self.discriminator(features, cond)
+        return logits

scripts/init.sh CHANGED Viewed

@@ -1,2 +1,14 @@
-source /p/project/laionize/miniconda/bin/activate
-conda activate ddgan

+ml purge
+ml use $OTHERSTAGES
+ml Stages/2022
+ml GCC/11.2.0
+ml OpenMPI/4.1.2
+ml CUDA/11.5
+ml cuDNN/8.3.1.22-CUDA-11.5
+ml NCCL/2.12.7-1-CUDA-11.5
+ml PyTorch/1.11-CUDA-11.5
+ml Horovod/0.24
+ml torchvision/0.12.0
+source /p/home/jusers/cherti1/jureca/ccstdl/code/feed_forward_vqgan_clip/envs/jureca_2022/bin/activate
+export HOROVOD_CACHE_CAPACITY=4096
+export CUDA_VISIBLE_DEVICES=0,1,2,3

scripts/run_jurecadc_conda.sh ADDED Viewed

	@@ -0,0 +1,23 @@

+#!/bin/bash -x
+#SBATCH --account=zam
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=24
+#SBATCH --time=06:00:00
+#SBATCH --gres=gpu:4
+#SBATCH --partition=dc-gpu
+ml CUDA
+source /p/project/laionize/miniconda/bin/activate
+conda activate ddgan
+#source scripts/init_2022.sh
+#source scripts/init_2020.sh
+#source scripts/init.sh
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+echo "Job id: $SLURM_JOB_ID"
+export TOKENIZERS_PARALLELISM=false
+#export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_IB_TIMEOUT=50
+export UCX_RC_TIMEOUT=4s
+export NCCL_IB_RETRY_CNT=10
+export TORCH_DISTRIBUTED_DEBUG=INFO
+srun python -u $*

scripts/run_juwelsbooster_conda.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+#!/bin/bash -x
+#SBATCH --account=laionize
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --cpus-per-task=24
+#SBATCH --time=06:00:00
+#SBATCH --gres=gpu:4
+#SBATCH --partition=booster
+ml CUDA
+source /p/project/laionize/miniconda/bin/activate
+conda activate ddgan
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+echo "Job id: $SLURM_JOB_ID"
+export TOKENIZERS_PARALLELISM=false
+#export NCCL_ASYNC_ERROR_HANDLING=1
+export NCCL_IB_TIMEOUT=50
+export UCX_RC_TIMEOUT=4s
+export NCCL_IB_RETRY_CNT=10
+srun python -u $*

test.py ADDED Viewed

	@@ -0,0 +1,8 @@

+from score_sde.models.projected_discriminator import ProjectedDiscriminator
+import torch
+discr = ProjectedDiscriminator(num_discs=4, backbone_kwargs={"cond_size": 768})
+x = torch.randn(1,3,224,224)
+t = torch.randint(0, 1, size=(1,))
+cond = (None, torch.randn(1,77, 768), torch.ones(1,77, dtype=torch.bool))
+y = discr(x, t, x, cond=cond)
+print(y.shape)

test_ddgan.py CHANGED Viewed

@@ -384,15 +384,20 @@ def sample_and_test(args):
     for epoch in epochs:
         args.epoch_id = epoch
         path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(args.dataset, args.exp, args.epoch_id)
-        next_path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(args.dataset, args.exp, args.epoch_id+1)
         if not os.path.exists(path):
             continue
         print(path)
         #if not os.path.exists(next_path):
         #    print(f"STOP at {epoch}")
         #    break
-        ckpt = torch.load(path, map_location=device)
         suffix = '_' + args.eval_name if args.eval_name else ""
         dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(args.dataset, args.exp, args.epoch_id, suffix)
         next_dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(args.dataset, args.exp, args.epoch_id+1, suffix)

     for epoch in epochs:
         args.epoch_id = epoch
         path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(args.dataset, args.exp, args.epoch_id)
+        next_next_path = './saved_info/dd_gan/{}/{}/netG_{}.pth'.format(args.dataset, args.exp, args.epoch_id+2)
         if not os.path.exists(path):
             continue
+        if not os.path.exists(next_next_path):
+            break
         print(path)
         #if not os.path.exists(next_path):
         #    print(f"STOP at {epoch}")
         #    break
+        try:
+            ckpt = torch.load(path, map_location=device)
+        except Exception:
+            continue
         suffix = '_' + args.eval_name if args.eval_name else ""
         dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(args.dataset, args.exp, args.epoch_id, suffix)
         next_dest = './saved_info/dd_gan/{}/{}/eval_{}{}.json'.format(args.dataset, args.exp, args.epoch_id+1, suffix)

train_ddgan.py CHANGED Viewed

@@ -210,6 +210,7 @@ def get_autocast(precision):
 def train(rank, gpu, args):
     from score_sde.models.discriminator import Discriminator_small, Discriminator_large, CondAttnDiscriminator, SmallCondAttnDiscriminator
     from score_sde.models.ncsnpp_generator_adagn import NCSNpp
     from EMA import EMA
@@ -281,6 +282,12 @@ def train(rank, gpu, args):
                     transforms.ToTensor(),
                     transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
             ])
         shards = glob(os.path.join(args.dataset_root, "*.tar")) if os.path.isdir(args.dataset_root)  else args.dataset_root
         pipeline = [ResampledShards2(shards)]
         pipeline.extend([
@@ -295,7 +302,7 @@ def train(rank, gpu, args):
         pipeline.extend([
             wds.select(filter_no_caption),
             wds.decode("pilrgb", handler=log_and_continue),
-            wds.rename(image="jpg;png"),
             wds.map_dict(image=train_transform),
             wds.to_tuple("image","txt"),
             wds.batched(batch_size, partial=False),
@@ -361,7 +368,13 @@ def train(rank, gpu, args):
             t_emb_dim = args.t_emb_dim,
             cond_size=text_encoder.output_size,
             act=nn.LeakyReLU(0.2)).to(device)
     broadcast_params(netG.parameters())
     broadcast_params(netD.parameters())
@@ -387,7 +400,10 @@ def train(rank, gpu, args):
         netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu])
     else:
         netG = nn.parallel.DistributedDataParallel(netG, device_ids=[gpu])
-        netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu])
     if args.grad_checkpointing:
         from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
@@ -430,7 +446,7 @@ def train(rank, gpu, args):
                   .format(checkpoint['epoch']))
     else:
         global_step, epoch, init_epoch = 0, 0, 0
-    use_cond_attn_discr = args.discr_type in ("large_cond_attn", "small_cond_attn", "large_attn_pool")
     for epoch in range(init_epoch, args.num_epoch+1):
         if args.dataset == "wds":
             os.environ["WDS_EPOCH"] = str(epoch)

 def train(rank, gpu, args):
     from score_sde.models.discriminator import Discriminator_small, Discriminator_large, CondAttnDiscriminator, SmallCondAttnDiscriminator
+    from score_sde.models.projected_discriminator import ProjectedDiscriminator
     from score_sde.models.ncsnpp_generator_adagn import NCSNpp
     from EMA import EMA
                     transforms.ToTensor(),
                     transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
             ])
+        elif args.preprocessing == "simple_random_crop":
+            train_transform = transforms.Compose([
+                    transforms.RandomCrop(args.image_size, interpolation=3),
+                    transforms.ToTensor(),
+                    transforms.Normalize((0.5,0.5,0.5), (0.5,0.5,0.5))
+            ])
         shards = glob(os.path.join(args.dataset_root, "*.tar")) if os.path.isdir(args.dataset_root)  else args.dataset_root
         pipeline = [ResampledShards2(shards)]
         pipeline.extend([
         pipeline.extend([
             wds.select(filter_no_caption),
             wds.decode("pilrgb", handler=log_and_continue),
+            wds.rename(image="jpg;png;webp"),
             wds.map_dict(image=train_transform),
             wds.to_tuple("image","txt"),
             wds.batched(batch_size, partial=False),
             t_emb_dim = args.t_emb_dim,
             cond_size=text_encoder.output_size,
             act=nn.LeakyReLU(0.2)).to(device)
+    elif args.discr_type == "projected_gan":
+        netD = ProjectedDiscriminator(
+            num_discs=4,
+            backbone_kwargs={"cond_size": text_encoder.output_size}
+        )
+        netD = netD.to(device)
     broadcast_params(netG.parameters())
     broadcast_params(netD.parameters())
         netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu])
     else:
         netG = nn.parallel.DistributedDataParallel(netG, device_ids=[gpu])
+        netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu], find_unused_parameters=args.discr_type=="projected_gan")
+        #if args.discr_type == "projected_gan":
+        #    netD._set_static_graph()
     if args.grad_checkpointing:
         from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
                   .format(checkpoint['epoch']))
     else:
         global_step, epoch, init_epoch = 0, 0, 0
+    use_cond_attn_discr = args.discr_type in ("large_cond_attn", "small_cond_attn", "large_attn_pool", "projected_gan")
     for epoch in range(init_epoch, args.num_epoch+1):
         if args.dataset == "wds":
             os.environ["WDS_EPOCH"] = str(epoch)