Commit 8ab4de9 by Mehdi Cherti (1 parent: 3dcdf92)

add basic cross attention + global attention block

Files changed:
- score_sde/models/layers.py                 +1  -1
- score_sde/models/layerspp.py               +28 -0
- score_sde/models/ncsnpp_generator_adagn.py +42 -4
- train_ddgan.py                             +39 -25

score_sde/models/layers.py
CHANGED
@@ -583,7 +583,7 @@ class Identity(nn.Module):
   def forward(self, x, *args, **kwargs):
     return x
 
-
+
 class CrossAttention(nn.Module):
   def __init__(
     self,
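The hunk above only touches a blank line next to the CrossAttention stub; the body of layers.CrossAttention is not visible in this commit. For orientation, a minimal sketch of a standard multi-head cross-attention layer with the constructor signature this commit relies on (channels, context_dim, dim_head, heads); this is an illustration of the technique, not the repository's implementation:

    import torch
    import torch.nn as nn

    class MinimalCrossAttention(nn.Module):
        """Queries from x (B, N, dim); keys/values from context (B, M, context_dim)."""
        def __init__(self, dim, *, context_dim=None, dim_head=64, heads=8):
            super().__init__()
            context_dim = context_dim if context_dim is not None else dim
            inner = dim_head * heads
            self.heads, self.scale = heads, dim_head ** -0.5
            self.to_q = nn.Linear(dim, inner, bias=False)
            self.to_kv = nn.Linear(context_dim, inner * 2, bias=False)
            self.to_out = nn.Linear(inner, dim, bias=False)

        def forward(self, x, context, mask=None):
            B, N, _ = x.shape
            q = self.to_q(x)
            k, v = self.to_kv(context).chunk(2, dim=-1)
            # split heads: (B, seq, heads*d) -> (B, heads, seq, d)
            q, k, v = (t.view(B, -1, self.heads, t.shape[-1] // self.heads).transpose(1, 2)
                       for t in (q, k, v))
            sim = torch.einsum('bhid,bhjd->bhij', q, k) * self.scale
            if mask is not None:  # mask: (B, M) bool, True where context tokens are valid
                sim = sim.masked_fill(~mask[:, None, None, :], torch.finfo(sim.dtype).min)
            out = torch.einsum('bhij,bhjd->bhid', sim.softmax(dim=-1), v)
            return self.to_out(out.transpose(1, 2).reshape(B, N, -1))
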
score_sde/models/layerspp.py
CHANGED
@@ -123,6 +123,34 @@ class AttnBlockpp(nn.Module):
     else:
       return (x + h) / np.sqrt(2.)
 
+class AttnBlockppRaw(nn.Module):
+  """Channel-wise self-attention block. Modified from DDPM."""
+
+  def __init__(self, channels, skip_rescale=False, init_scale=0.):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=min(channels // 4, 32), num_channels=channels,
+                                    eps=1e-6)
+    self.NIN_0 = NIN(channels, channels)
+    self.NIN_1 = NIN(channels, channels)
+    self.NIN_2 = NIN(channels, channels)
+    self.NIN_3 = NIN(channels, channels, init_scale=init_scale)
+    self.skip_rescale = skip_rescale
+
+  def forward(self, x):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    q = self.NIN_0(h)
+    k = self.NIN_1(h)
+    v = self.NIN_2(h)
+
+    w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+    w = torch.reshape(w, (B, H, W, H * W))
+    w = F.softmax(w, dim=-1)
+    w = torch.reshape(w, (B, H, W, H, W))
+    h = torch.einsum('bhwij,bcij->bchw', w, v)
+    h = self.NIN_3(h)
+    return h
+
 
 class Upsample(nn.Module):
   def __init__(self, in_ch=None, out_ch=None, with_conv=False, fir=False,
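Unlike AttnBlockpp, the new AttnBlockppRaw returns only the attention output: no residual is added inside the block, and the skip_rescale flag is stored but never used in forward, so the caller decides how to combine it with the input. A minimal usage sketch (shapes illustrative, module path assumed importable):

    import torch
    from score_sde.models.layerspp import AttnBlockppRaw

    x = torch.randn(4, 64, 16, 16)   # (B, C, H, W) feature map
    attn = AttnBlockppRaw(channels=64)
    h = attn(x)                      # raw attention output, same shape as x
    out = x + h                      # residual is added by the caller
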
score_sde/models/ncsnpp_generator_adagn.py
CHANGED
@@ -53,6 +53,36 @@ get_act = layers.get_act
 default_initializer = layers.default_init
 dense = dense_layer.dense
 
+class CrossAndGlobalAttnBlock(nn.Module):
+  """Cross-attention to a conditioning sequence plus a global self-attention block."""
+  def __init__(self, channels, *, context_dim=None, dim_head=64, heads=8, norm_context=False, cosine_sim_attn=False):
+    super().__init__()
+    self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
+    self.ca = layers.CrossAttention(
+      channels,
+      context_dim=context_dim,
+      dim_head=dim_head,
+      heads=heads,
+      norm_context=norm_context,
+      cosine_sim_attn=cosine_sim_attn,
+    )
+    self.attn = layerspp.AttnBlockppRaw(channels)
+
+  def forward(self, x, cond, mask=None):
+    B, C, H, W = x.shape
+    h = self.GroupNorm_0(x)
+    h = h.view(B, C, H*W)
+    h = h.permute(0,2,1)
+    h = h.contiguous()
+    h_new = self.ca(h, cond, mask=mask)
+    h_new = h_new.permute(0,2,1)
+    h_new = h_new.contiguous()
+    h_new = h_new.view(B, C, H, W)
+
+    h_global = self.attn(x)
+    h = h_new + h_global
+    return x + h
+
 class PixelNorm(nn.Module):
   def __init__(self):
     super().__init__()
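A sketch of how this block is driven during the forward pass: x is a feature map at an attention resolution, cond is the text-encoder output of width config.cond_size, and mask flags valid tokens. Note the channel count must be divisible by the 32 GroupNorm groups. Values here are illustrative, and the module path is assumed importable:

    import torch
    from score_sde.models.ncsnpp_generator_adagn import CrossAndGlobalAttnBlock

    B, C, H, W = 2, 128, 16, 16
    cond_size, seq_len = 768, 77                # e.g. T5-base width, caption length
    x = torch.randn(B, C, H, W)
    cond = torch.randn(B, seq_len, cond_size)
    cond_mask = torch.ones(B, seq_len, dtype=torch.bool)

    block = CrossAndGlobalAttnBlock(C, context_dim=cond_size)
    out = block(x, cond, mask=cond_mask)        # cross-attn + global self-attn + residual
    assert out.shape == x.shape
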
@@ -68,6 +98,7 @@ class NCSNpp(nn.Module):
   def __init__(self, config):
     super().__init__()
     self.config = config
+    self.cross_attention_block = config.cross_attention_block
     self.grad_checkpointing = config.grad_checkpointing if hasattr(config, "grad_checkpointing") else False
     self.not_use_tanh = config.not_use_tanh
     self.act = act = nn.SiLU()
@@ -124,7 +155,14 @@ class NCSNpp(nn.Module):
     modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
     nn.init.zeros_(modules[-1].bias)
     if config.cross_attention:
-      AttnBlock = functools.partial(layers.CondAttnBlock, context_dim=config.cond_size)
+
+      #block_name = config.cross_attention_block if hasattr(config, "cross_attention_block") else "basic"
+      block_name = config.cross_attention_block
+      if block_name == "basic":
+        AttnBlock = functools.partial(layers.CondAttnBlock, context_dim=config.cond_size)
+      elif block_name == "cross_and_global_attention":
+        AttnBlock = functools.partial(CrossAndGlobalAttnBlock, context_dim=config.cond_size)
+      print(AttnBlock)
     else:
       AttnBlock = functools.partial(layerspp.AttnBlockpp,
                                     init_scale=init_scale,
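Note that the new selection logic has no fallback branch: any --cross_attention_block value other than "basic" or "cross_and_global_attention" leaves AttnBlock unbound and fails later with a NameError. A defensive guard (hypothetical, not part of the commit) could be:

    if block_name not in ("basic", "cross_and_global_attention"):
        raise ValueError(f"unknown cross_attention_block: {block_name}")

The leftover print(AttnBlock) also looks like a debugging aid.
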
@@ -342,7 +380,7 @@ class NCSNpp(nn.Module):
         h = modules[m_idx](hs[-1], temb, zemb)
         m_idx += 1
         if h.shape[-1] in self.attn_resolutions:
-          if type(modules[m_idx]) == layers.CondAttnBlock:
+          if type(modules[m_idx]) in (layers.CondAttnBlock, CrossAndGlobalAttnBlock):
             h = modules[m_idx](h, cond, cond_mask)
           else:
             h = modules[m_idx](h)
@@ -377,7 +415,7 @@ class NCSNpp(nn.Module):
     h = hs[-1]
     h = modules[m_idx](h, temb, zemb)
     m_idx += 1
-    if type(modules[m_idx]) == layers.CondAttnBlock:
+    if type(modules[m_idx]) in (layers.CondAttnBlock, CrossAndGlobalAttnBlock):
       h = modules[m_idx](h, cond, cond_mask)
     else:
       h = modules[m_idx](h)
@@ -394,7 +432,7 @@ class NCSNpp(nn.Module):
       m_idx += 1
 
       if h.shape[-1] in self.attn_resolutions:
-        if type(modules[m_idx]) == layers.CondAttnBlock:
+        if type(modules[m_idx]) in (layers.CondAttnBlock, CrossAndGlobalAttnBlock):
           h = modules[m_idx](h, cond, cond_mask)
         else:
           h = modules[m_idx](h)
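The three forward-pass hunks dispatch on exact classes, so a conditioned block that is subclassed or wrapped (for example by an activation-checkpointing wrapper) would silently fall through to the unconditional call. An equivalent but more tolerant check (a suggestion, not in the commit):

    if isinstance(modules[m_idx], (layers.CondAttnBlock, CrossAndGlobalAttnBlock)):
        h = modules[m_idx](h, cond, cond_mask)
    else:
        h = modules[m_idx](h)
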
train_ddgan.py
CHANGED
@@ -385,9 +385,10 @@ def train(rank, gpu, args):
         backbone_kwargs={"cond_size": text_encoder.output_size}
     )
     netD = netD.to(device)
-
-    broadcast_params(netG.parameters())
-    broadcast_params(netD.parameters())
+
+    if args.world_size > 1:
+        broadcast_params(netG.parameters())
+        broadcast_params(netD.parameters())
 
     if args.fsdp:
         from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper
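broadcast_params is the repository's helper for synchronizing initial weights across workers; guarding it behind args.world_size > 1 lets single-GPU runs avoid touching torch.distributed at all. A minimal sketch of what such a helper typically does (illustrative stand-in, assuming an initialized process group):

    import torch.distributed as dist

    def broadcast_params(params):  # illustrative, not the repository's implementation
        # Push rank 0's initial weights to every worker so all replicas
        # start identical before gradient averaging takes over.
        for p in params:
            dist.broadcast(p.data, src=0)
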
@@ -410,8 +411,9 @@ def train(rank, gpu, args):
     if args.fsdp:
         netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu])
     else:
-        netG = nn.parallel.DistributedDataParallel(netG, device_ids=[gpu])
-        netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu], find_unused_parameters=args.discr_type=="projected_gan")
+        if args.world_size > 1:
+            netG = nn.parallel.DistributedDataParallel(netG, device_ids=[gpu])
+            netD = nn.parallel.DistributedDataParallel(netD, device_ids=[gpu], find_unused_parameters=args.discr_type=="projected_gan")
         #if args.discr_type == "projected_gan":
         #    netD._set_static_graph()
 
@@ -652,7 +654,8 @@ def train(rank, gpu, args):
             torchvision.utils.save_image(fake_sample, os.path.join(exp_path, 'sample_discrete_epoch_{}_iteration_{}.png'.format(epoch, iteration)), normalize=True)
 
         if args.save_content:
-            dist.barrier()
+            if args.world_size > 1:
+                dist.barrier()
             if rank == 0:
                 print('Saving content.')
                 def to_cpu(d):
@@ -709,20 +712,26 @@ def init_processes(rank, size, fn, args):
     """ Initialize the distributed environment. """
 
     import os
-
-    args.rank = int(os.environ['SLURM_PROCID'])
-    args.world_size = int(os.getenv("SLURM_NTASKS"))
-    args.local_rank = int(os.environ['SLURM_LOCALID'])
-    print(args.rank, args.world_size)
-    args.master_address = os.getenv("SLURM_LAUNCH_NODE_IPADDR")
-    os.environ['MASTER_ADDR'] = args.master_address
-    os.environ['MASTER_PORT'] = "12345"
-    torch.cuda.set_device(args.local_rank)
-    gpu = args.local_rank
-    dist.init_process_group(backend='nccl', init_method='env://', rank=rank, world_size=args.world_size)
-    fn(rank, gpu, args)
-    dist.barrier()
-    cleanup()
+
+    if size == 1:
+        args.rank = 0
+        args.world_size = 1
+        args.local_rank = 0
+        fn(rank, args.local_rank, args)
+    else:
+        args.rank = int(os.environ['SLURM_PROCID'])
+        args.world_size = int(os.getenv("SLURM_NTASKS"))
+        args.local_rank = int(os.environ['SLURM_LOCALID'])
+        print(args.rank, args.world_size)
+        args.master_address = os.getenv("SLURM_LAUNCH_NODE_IPADDR")
+        os.environ['MASTER_ADDR'] = args.master_address
+        os.environ['MASTER_PORT'] = "12345"
+        torch.cuda.set_device(args.local_rank)
+        gpu = args.local_rank
+        dist.init_process_group(backend='nccl', init_method='env://', rank=rank, world_size=args.world_size)
+        fn(rank, gpu, args)
+        dist.barrier()
+        cleanup()
 
 def cleanup():
     dist.destroy_process_group()
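The multi-process branch of init_processes assumes a SLURM launch and reads everything it needs from the scheduler's environment (the master port is hardcoded to 12345):

    SLURM_PROCID              global rank of this task
    SLURM_NTASKS              total number of tasks (the world size)
    SLURM_LOCALID             local rank on the node, used to pick the GPU
    SLURM_LAUNCH_NODE_IPADDR  address exported as MASTER_ADDR
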
@@ -737,6 +746,8 @@ if __name__ == '__main__':
     parser.add_argument('--mismatch_loss', action='store_true',default=False, help="use mismatch loss")
     parser.add_argument('--text_encoder', type=str, default="google/t5-v1_1-base")
     parser.add_argument('--cross_attention', action='store_true',default=False, help="use cross attention in generator")
+    parser.add_argument('--cross_attention_block', default="basic", help="cross attention block type")
+
     parser.add_argument('--fsdp', action='store_true',default=False, help='use FSDP')
     parser.add_argument('--grad_checkpointing', action='store_true',default=False, help='use grad checkpointing')
 
@@ -809,7 +820,7 @@ if __name__ == '__main__':
     parser.add_argument('--beta2', type=float, default=0.9,
                         help='beta2 for adam')
     parser.add_argument('--no_lr_decay',action='store_true', default=False)
-    parser.add_argument('--grad_penalty_cond', action='store_true',default=False, help="cond based grad
+    parser.add_argument('--grad_penalty_cond', action='store_true',default=False, help="cond based grad")
 
     parser.add_argument('--use_ema', action='store_true', default=False,
                         help='use EMA or not')
@@ -828,6 +839,7 @@ if __name__ == '__main__':
     parser.add_argument('--precision', type=str, default="fp32")
 
     ###ddp
+
     parser.add_argument('--num_proc_node', type=int, default=1,
                         help='The number of nodes in multi node env.')
     parser.add_argument('--num_process_per_node', type=int, default=1,
@@ -840,8 +852,10 @@ if __name__ == '__main__':
                         help='address for master')
 
     args = parser.parse_args()
-
-
-
-
+    if 'SLURM_NTASKS' in os.environ:
+        args.world_size = int(os.getenv("SLURM_NTASKS"))
+        args.rank = int(os.environ['SLURM_PROCID'])
+    else:
+        args.world_size = 1
+        args.rank = 0
     init_processes(args.rank, args.world_size, train, args)
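With the new size == 1 branch, a single-GPU run no longer needs a SLURM environment at all. An illustrative invocation using only the flags visible in this diff (the training script's many other required arguments are elided):

    python train_ddgan.py --cross_attention --cross_attention_block cross_and_global_attention ...
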