wenpeng committed on
Commit c7813b3 · 1 Parent(s): 9f59d48

update .gitignore

.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zstandard filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ inpaint/weights
+ sod/weights
+ **/__pycache__
+ flagged
+ **/*.zip
app.py ADDED
@@ -0,0 +1,23 @@
+ import gradio as gr
+ import inpaint.infer_model as inpaint
+ import sod.infer_model as sod
+ import numpy as np
+ import torch
+ import os
+ # cmd = 'sh download.sh'
+ # os.system(cmd)
+
+ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+ inpaint_model = inpaint.IVModel(device=device)
+ sod_model = sod.IVModel(device=torch.device("cpu"))
+
+ def sod_inpaint(img):
+     img = img[:, :, ::-1]                    # Gradio passes RGB; the models work in BGR
+     res = sod_model.forward(img, None)       # SOD output (image with mask appended) feeds the inpainting stage
+     res = np.uint8(res)
+     res = inpaint_model.forward(res, None)   # LaMa inpainting erases the masked (salient) region
+     res = np.uint8(res)
+     return res[:, :, ::-1]                   # back to RGB for Gradio
+
+ # UI text (Chinese): title = "Salient Object Removal"; description = "An image API that automatically removes the salient object from the picture."
+ iface = gr.Interface(fn=sod_inpaint, inputs="image", outputs="image", examples='examples', title='显著物体消除', description='这是一个图像API,功能是自动把画面中的显著物体消除', theme='huggingface')
+ iface.launch(server_name='0.0.0.0', share=False)
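For local debugging, the same pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the weights have been fetched by download.sh and that sod.infer_model.IVModel (not part of this commit) follows the same interface as the inpaint wrapper and returns the image with its predicted mask appended on the right:

import cv2
import numpy as np
import torch
import inpaint.infer_model as inpaint
import sod.infer_model as sod

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
inpaint_model = inpaint.IVModel(device=device)
sod_model = sod.IVModel(device=torch.device("cpu"))

img = cv2.imread('examples/SOD001.jpg')                  # BGR, matching what sod_inpaint feeds the models
with_mask = np.uint8(sod_model.forward(img, None))       # image + saliency mask, side by side (assumed format)
result = np.uint8(inpaint_model.forward(with_mask, None))
cv2.imwrite('removed.jpg', result)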
download.sh ADDED
@@ -0,0 +1,9 @@
+ FILE_ID=1udSLeuWAZf2-uI7SI8dFEfBuyvLSpB9V
+ checkpoint_path='inpaint/weights.zip'
+ wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILE_ID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILE_ID}" -O ${checkpoint_path} && rm -rf /tmp/cookies.txt
+ unzip $checkpoint_path
+
+ FILE_ID=1qI8-HBTz2nNSTyD9iB4YMn067P7XMKuD
+ checkpoint_path='sod/weights.zip'
+ wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILE_ID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILE_ID}" -O ${checkpoint_path} && rm -rf /tmp/cookies.txt
+ unzip $checkpoint_path
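The script is meant to be run once from the repository root (the commented-out `sh download.sh` call in app.py would invoke it the same way). An equivalent sketch in Python, assuming the optional gdown package is installed, using the same Google Drive file IDs as above:

import zipfile
import gdown

for file_id, archive in [('1udSLeuWAZf2-uI7SI8dFEfBuyvLSpB9V', 'inpaint/weights.zip'),
                         ('1qI8-HBTz2nNSTyD9iB4YMn067P7XMKuD', 'sod/weights.zip')]:
    gdown.download(f'https://drive.google.com/uc?id={file_id}', archive, quiet=False)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall()  # like download.sh, unpack into the current directory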
examples/SOD001.jpg ADDED
examples/SOD003.jpeg ADDED
examples/SOD013.jpg ADDED
examples/SOD015.jpg ADDED
inpaint/configs/prediction/default.yaml ADDED
@@ -0,0 +1,14 @@
+ indir: no  # to be overridden in CLI
+ outdir: no  # to be overridden in CLI
+
+ model:
+   path: no  # to be overridden in CLI
+   checkpoint: best.ckpt
+
+ dataset:
+   kind: default
+   img_suffix: .png
+   pad_out_to_modulo: 8
+
+ device: cuda
+ out_key: inpainted
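The fields marked "to be overridden" are never read from this file as-is; infer_model.py and predict.py load the config with OmegaConf and assign them in code. A sketch of that pattern (the indir/outdir values below are placeholders):

from omegaconf import OmegaConf

predict_config = OmegaConf.load('inpaint/configs/prediction/default.yaml')
predict_config.model.path = 'inpaint/weights/big-lama'   # what infer_model.py sets
predict_config.indir = '/path/to/images_with_masks'      # only predict.py uses indir/outdir
predict_config.outdir = '/path/to/results'
print(OmegaConf.to_yaml(predict_config))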
inpaint/infer_model.py ADDED
@@ -0,0 +1,90 @@
+ import os
+ import sys
+
+ sys.path.append('inpaint')
+ os.environ['OMP_NUM_THREADS'] = '1'
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
+ os.environ['MKL_NUM_THREADS'] = '1'
+ os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
+ os.environ['NUMEXPR_NUM_THREADS'] = '1'
+
+ import cv2
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import yaml
+ from omegaconf import OmegaConf
+ from saicinpainting.training.trainers import load_checkpoint
+
+
+ class IVModel():
+     def __init__(self, device=torch.device('cuda:0')):
+         super(IVModel, self).__init__()
+         self.device = device
+         conf_path = 'inpaint/configs/prediction/default.yaml'
+         if not os.path.exists(conf_path):
+             print('Config file not found!')
+         predict_config = OmegaConf.load(conf_path)
+         predict_config.model.path = 'inpaint/weights/big-lama'
+         train_config_path = os.path.join(predict_config.model.path, 'config.yaml')
+         with open(train_config_path, 'r') as f:
+             train_config = OmegaConf.create(yaml.safe_load(f))
+
+         train_config.training_model.predict_only = True
+         train_config.visualizer.kind = 'noop'
+
+         checkpoint_path = os.path.join(predict_config.model.path, 'models', predict_config.model.checkpoint)
+         self.model = load_checkpoint(train_config, checkpoint_path, strict=False, map_location='cpu')
+         self.model.freeze()
+         self.model.to(device)
+
+         self.__first_forward__()
+
+     def __first_forward__(self, input_size=(2048, 4096, 3)):
+         # Run forward() once at the largest supported size to pin down peak GPU memory use.
+         print('initialize Inpaint Model...')
+         _ = self.forward(np.random.rand(*input_size) * 255, None)
+         print('initialize Complete!')
+
+     def __resize_tensor__(self, image, max_size=1024, scale_factor=8):
+         h, w = image.size()[2:]
+         if max(h, w) > max_size:
+             if h < w:
+                 h, w = int(max_size * h / w), max_size
+             else:
+                 h, w = max_size, int(max_size * w / h)
+         h = h // scale_factor * scale_factor
+         w = w // scale_factor * scale_factor
+         image = F.interpolate(image, (h, w), mode='bicubic')
+         return image
+
+     def input_preprocess_tensor(self, img):
+         img_t = torch.from_numpy(img.astype(np.float32))  # .to(self.device)
+         img_t = img_t.permute(2, 0, 1).unsqueeze(0)
+         img_t = img_t / 255.
+         img_t_for_net = self.__resize_tensor__(img_t).to(self.device)  # capped at 1024 px to bound GPU memory
+         img_t_for_out = self.__resize_tensor__(img_t, max_size=2048).to(self.device)  # capped at 2048 px for the output
+         return img_t_for_net, img_t_for_out
+
+     def forward(self, img, json_data):
+         # Input layout: [image | mask] concatenated along the width, image on the left.
+         _, w, _ = img.shape
+         mask = img[:, w // 2:, 0]
+         kernel = np.ones((4, 4), np.uint8)
+         mask = cv2.dilate(mask, kernel, iterations=5)[:, :, np.newaxis]
+         img = img[:, :w // 2, ::-1]
+         # print(img.shape, mask.shape)
+         img_t_for_net, img_t_for_out = self.input_preprocess_tensor(img)
+         mask_t_for_net, mask_t_for_out = self.input_preprocess_tensor(mask)
+         h, w = img_t_for_out.shape[2:]
+         # print(img_t.shape, mask_t.shape)
+         mask_t_for_out = (mask_t_for_out > 0).int()
+         mask_t_for_net = (mask_t_for_net > 0).int()
+         with torch.no_grad():
+             res_t = self.model(dict(image=img_t_for_net, mask=mask_t_for_net))['inpainted']
+             res_t = F.interpolate(res_t, (h, w), mode='bicubic')
+             res_t = img_t_for_out * (1 - mask_t_for_out) + res_t * mask_t_for_out
+             res_t = torch.clip(res_t * 255, min=0, max=255)
+         res = res_t.squeeze(0).permute(1, 2, 0).detach().cpu().numpy()
+         res = res[:, :, (2, 1, 0)].astype(np.uint8)
+         return res
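The wrapper can also be driven directly. A minimal sketch, assuming the big-lama weights are unpacked under inpaint/weights/big-lama; forward() expects one array holding the BGR image on the left half and its mask on the right half, which is the format the SOD stage produces in app.py (the rectangle below is a hypothetical mask):

import cv2
import numpy as np
import torch
import inpaint.infer_model as inpaint

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = inpaint.IVModel(device=device)

img = cv2.imread('examples/SOD001.jpg')            # BGR, H x W x 3
mask = np.zeros_like(img)
mask[100:200, 150:300] = 255                       # hypothetical region to erase
side_by_side = np.concatenate([img, mask], axis=1) # [image | mask], width doubles
result = model.forward(side_by_side, None)         # uint8 BGR output, resized to at most 2048 px
cv2.imwrite('inpainted.jpg', result)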
inpaint/predict.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+
3
+ # Example command:
4
+ # ./bin/predict.py \
5
+ # model.path=<path to checkpoint, prepared by make_checkpoint.py> \
6
+ # indir=<path to input data> \
7
+ # outdir=<where to store predicts>
8
+
9
+ import logging
10
+ import os
11
+ import sys
12
+ import traceback
13
+
14
+ from saicinpainting.evaluation.utils import move_to_device
15
+
16
+ os.environ['OMP_NUM_THREADS'] = '1'
17
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
18
+ os.environ['MKL_NUM_THREADS'] = '1'
19
+ os.environ['VECLIB_MAXIMUM_THREADS'] = '1'
20
+ os.environ['NUMEXPR_NUM_THREADS'] = '1'
21
+
22
+ import cv2
23
+ # import hydra
24
+ import numpy as np
25
+ import torch
26
+ import tqdm
27
+ import yaml
28
+ from omegaconf import OmegaConf
29
+ from torch.utils.data._utils.collate import default_collate
30
+
31
+ from saicinpainting.training.data.datasets import make_default_val_dataset
32
+ from saicinpainting.training.trainers import load_checkpoint
33
+ from saicinpainting.utils import register_debug_signal_handlers
34
+
35
+ LOGGER = logging.getLogger(__name__)
36
+
37
+
38
+ # @hydra.main(config_path='../configs/prediction', config_name='default.yaml')
39
+ def main(predict_config: OmegaConf):
40
+ try:
41
+ register_debug_signal_handlers() # kill -10 <pid> will result in traceback dumped into log
42
+
43
+ device = torch.device(predict_config.device)
44
+ print(predict_config)
45
+ train_config_path = os.path.join(predict_config.model.path, 'config.yaml')
46
+ with open(train_config_path, 'r') as f:
47
+ train_config = OmegaConf.create(yaml.safe_load(f))
48
+
49
+ train_config.training_model.predict_only = True
50
+ train_config.visualizer.kind = 'noop'
51
+
52
+ out_ext = predict_config.get('out_ext', '.png')
53
+
54
+ checkpoint_path = os.path.join(predict_config.model.path,
55
+ 'models',
56
+ predict_config.model.checkpoint)
57
+ model = load_checkpoint(train_config, checkpoint_path, strict=False, map_location='cpu')
58
+ model.freeze()
59
+ model.to(device)
60
+
61
+ if not predict_config.indir.endswith('/'):
62
+ predict_config.indir += '/'
63
+
64
+ dataset = make_default_val_dataset(predict_config.indir, **predict_config.dataset)
65
+ with torch.no_grad():
66
+ for img_i in tqdm.trange(len(dataset)):
67
+ mask_fname = dataset.mask_filenames[img_i]
68
+ cur_out_fname = os.path.join(
69
+ predict_config.outdir,
70
+ os.path.splitext(os.path.basename(mask_fname))[0] + out_ext
71
+ )
72
+ os.makedirs(os.path.dirname(cur_out_fname), exist_ok=True)
73
+
74
+ batch = move_to_device(default_collate([dataset[img_i]]), device)
75
+ batch['mask'] = (batch['mask'] > 0) * 1
76
+ # print(torch.max(batch['mask']), torch.min(batch['mask']), torch.max(batch['image']), torch.min(batch['image']))
77
+ print(batch['mask'].dtype)
78
+ batch = model(batch)
79
+ cur_res = batch[predict_config.out_key][0].permute(1, 2, 0).detach().cpu().numpy()
80
+
81
+ cur_res = np.clip(cur_res * 255, 0, 255).astype('uint8')
82
+ cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR)
83
+ cv2.imwrite(cur_out_fname, cur_res)
84
+ except KeyboardInterrupt:
85
+ LOGGER.warning('Interrupted by user')
86
+ except Exception as ex:
87
+ LOGGER.critical(f'Prediction failed due to {ex}:\n{traceback.format_exc()}')
88
+ sys.exit(1)
89
+
90
+
91
+ if __name__ == '__main__':
92
+ base_conf = OmegaConf.load('configs/prediction/default.yaml')
93
+ base_conf.model.path='../../weights/big-lama'
94
+ base_conf.indir='/home/zwp/Temp2018/ZWP/data/example/照片补全/测试图片/带mask的图片2'
95
+ base_conf.outdir='zb_result/带mask的图片2'
96
+ main(predict_config=base_conf)
inpaint/saicinpainting/training/modules/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from saicinpainting.training.modules.ffc import FFCResNetGenerator
+
+ def make_generator(config, kind, **kwargs):
+     return FFCResNetGenerator(**kwargs)
+
+
+
inpaint/saicinpainting/training/modules/base.py ADDED
@@ -0,0 +1,80 @@
1
+ import abc
2
+ from typing import Tuple, List
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv
8
+ from saicinpainting.training.modules.multidilated_conv import MultidilatedConv
9
+
10
+
11
+ class BaseDiscriminator(nn.Module):
12
+ @abc.abstractmethod
13
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
14
+ """
15
+ Predict scores and get intermediate activations. Useful for feature matching loss
16
+ :return tuple (scores, list of intermediate activations)
17
+ """
18
+ raise NotImplementedError()
19
+
20
+
21
+ def get_conv_block_ctor(kind='default'):
22
+ if not isinstance(kind, str):
23
+ return kind
24
+ if kind == 'default':
25
+ return nn.Conv2d
26
+ if kind == 'depthwise':
27
+ return DepthWiseSeperableConv
28
+ if kind == 'multidilated':
29
+ return MultidilatedConv
30
+ raise ValueError(f'Unknown convolutional block kind {kind}')
31
+
32
+
33
+ def get_norm_layer(kind='bn'):
34
+ if not isinstance(kind, str):
35
+ return kind
36
+ if kind == 'bn':
37
+ return nn.BatchNorm2d
38
+ if kind == 'in':
39
+ return nn.InstanceNorm2d
40
+ raise ValueError(f'Unknown norm block kind {kind}')
41
+
42
+
43
+ def get_activation(kind='tanh'):
44
+ if kind == 'tanh':
45
+ return nn.Tanh()
46
+ if kind == 'sigmoid':
47
+ return nn.Sigmoid()
48
+ if kind is False:
49
+ return nn.Identity()
50
+ raise ValueError(f'Unknown activation kind {kind}')
51
+
52
+
53
+ class SimpleMultiStepGenerator(nn.Module):
54
+ def __init__(self, steps: List[nn.Module]):
55
+ super().__init__()
56
+ self.steps = nn.ModuleList(steps)
57
+
58
+ def forward(self, x):
59
+ cur_in = x
60
+ outs = []
61
+ for step in self.steps:
62
+ cur_out = step(cur_in)
63
+ outs.append(cur_out)
64
+ cur_in = torch.cat((cur_in, cur_out), dim=1)
65
+ return torch.cat(outs[::-1], dim=1)
66
+
67
+ def deconv_factory(kind, ngf, mult, norm_layer, activation, max_features):
68
+ if kind == 'convtranspose':
69
+ return [nn.ConvTranspose2d(min(max_features, ngf * mult),
70
+ min(max_features, int(ngf * mult / 2)),
71
+ kernel_size=3, stride=2, padding=1, output_padding=1),
72
+ norm_layer(min(max_features, int(ngf * mult / 2))), activation]
73
+ elif kind == 'bilinear':
74
+ return [nn.Upsample(scale_factor=2, mode='bilinear'),
75
+ DepthWiseSeperableConv(min(max_features, ngf * mult),
76
+ min(max_features, int(ngf * mult / 2)),
77
+ kernel_size=3, stride=1, padding=1),
78
+ norm_layer(min(max_features, int(ngf * mult / 2))), activation]
79
+ else:
80
+ raise Exception(f"Invalid deconv kind: {kind}")
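For orientation, the three factories above map string "kinds" from the training configs to layer constructors. A small sketch using kinds defined in this file:

import torch
import torch.nn as nn
from saicinpainting.training.modules.base import get_conv_block_ctor, get_norm_layer, get_activation

block = nn.Sequential(
    get_conv_block_ctor('depthwise')(64, 64, kernel_size=3, padding=1),   # -> DepthWiseSeperableConv
    get_norm_layer('bn')(64),                                             # -> nn.BatchNorm2d
    get_activation('tanh'),                                               # -> nn.Tanh()
)
print(block(torch.randn(1, 64, 16, 16)).shape)   # torch.Size([1, 64, 16, 16])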
inpaint/saicinpainting/training/modules/depthwise_sep_conv.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ import torch.nn as nn
+
+ class DepthWiseSeperableConv(nn.Module):
+     def __init__(self, in_dim, out_dim, *args, **kwargs):
+         super().__init__()
+         if 'groups' in kwargs:
+             # ignoring groups for Depthwise Sep Conv
+             del kwargs['groups']
+
+         self.depthwise = nn.Conv2d(in_dim, in_dim, *args, groups=in_dim, **kwargs)
+         self.pointwise = nn.Conv2d(in_dim, out_dim, kernel_size=1)
+
+     def forward(self, x):
+         out = self.depthwise(x)
+         out = self.pointwise(out)
+         return out
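A quick sketch of why this layer exists: it is used as a drop-in replacement for nn.Conv2d (e.g. via get_conv_block_ctor('depthwise') above), factoring a dense convolution into a per-channel step plus a 1x1 pointwise step, which cuts the parameter count sharply:

import torch
import torch.nn as nn
from saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv

dense = nn.Conv2d(64, 128, kernel_size=3, padding=1)
separable = DepthWiseSeperableConv(64, 128, kernel_size=3, padding=1)
x = torch.randn(1, 64, 32, 32)
assert dense(x).shape == separable(x).shape              # same output shape
print(sum(p.numel() for p in dense.parameters()),        # 73,856
      sum(p.numel() for p in separable.parameters()))    # 8,960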
inpaint/saicinpainting/training/modules/fake_fakes.py ADDED
@@ -0,0 +1,47 @@
1
+ import torch
2
+ from kornia import SamplePadding
3
+ from kornia.augmentation import RandomAffine, CenterCrop
4
+
5
+
6
+ class FakeFakesGenerator:
7
+ def __init__(self, aug_proba=0.5, img_aug_degree=30, img_aug_translate=0.2):
8
+ self.grad_aug = RandomAffine(degrees=360,
9
+ translate=0.2,
10
+ padding_mode=SamplePadding.REFLECTION,
11
+ keepdim=False,
12
+ p=1)
13
+ self.img_aug = RandomAffine(degrees=img_aug_degree,
14
+ translate=img_aug_translate,
15
+ padding_mode=SamplePadding.REFLECTION,
16
+ keepdim=True,
17
+ p=1)
18
+ self.aug_proba = aug_proba
19
+
20
+ def __call__(self, input_images, masks):
21
+ blend_masks = self._fill_masks_with_gradient(masks)
22
+ blend_target = self._make_blend_target(input_images)
23
+ result = input_images * (1 - blend_masks) + blend_target * blend_masks
24
+ return result, blend_masks
25
+
26
+ def _make_blend_target(self, input_images):
27
+ batch_size = input_images.shape[0]
28
+ permuted = input_images[torch.randperm(batch_size)]
29
+ augmented = self.img_aug(input_images)
30
+ is_aug = (torch.rand(batch_size, device=input_images.device)[:, None, None, None] < self.aug_proba).float()
31
+ result = augmented * is_aug + permuted * (1 - is_aug)
32
+ return result
33
+
34
+ def _fill_masks_with_gradient(self, masks):
35
+ batch_size, _, height, width = masks.shape
36
+ grad = torch.linspace(0, 1, steps=width * 2, device=masks.device, dtype=masks.dtype) \
37
+ .view(1, 1, 1, -1).expand(batch_size, 1, height * 2, width * 2)
38
+ grad = self.grad_aug(grad)
39
+ grad = CenterCrop((height, width))(grad)
40
+ grad *= masks
41
+
42
+ grad_for_min = grad + (1 - masks) * 10
43
+ grad -= grad_for_min.view(batch_size, -1).min(-1).values[:, None, None, None]
44
+ grad /= grad.view(batch_size, -1).max(-1).values[:, None, None, None] + 1e-6
45
+ grad.clamp_(min=0, max=1)
46
+
47
+ return grad
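A usage sketch (shapes are illustrative, and it requires a kornia version where `from kornia import SamplePadding` still resolves): the generator blends each image with an augmented or shuffled neighbour inside the masked region, producing "fake fakes" for discriminator training.

import torch
from saicinpainting.training.modules.fake_fakes import FakeFakesGenerator

gen = FakeFakesGenerator(aug_proba=0.5)
images = torch.rand(4, 3, 128, 128)
masks = (torch.rand(4, 1, 128, 128) > 0.7).float()
fake_fakes, blend_masks = gen(images, masks)   # blended images and the gradient blend masks
print(fake_fakes.shape, blend_masks.shape)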
inpaint/saicinpainting/training/modules/ffc.py ADDED
@@ -0,0 +1,367 @@
1
+ # Fast Fourier Convolution NeurIPS 2020
2
+ # original implementation https://github.com/pkumivision/FFC/blob/main/model_zoo/ffc.py
3
+ # paper https://proceedings.neurips.cc/paper/2020/file/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from saicinpainting.training.modules.base import get_activation
10
+ from saicinpainting.training.modules.spatial_transform import LearnableSpatialTransformWrapper
11
+ from saicinpainting.training.modules.squeeze_excitation import SELayer
12
+
13
+
14
+ class FFCSE_block(nn.Module):
15
+
16
+ def __init__(self, channels, ratio_g):
17
+ super(FFCSE_block, self).__init__()
18
+ in_cg = int(channels * ratio_g)
19
+ in_cl = channels - in_cg
20
+ r = 16
21
+
22
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
23
+ self.conv1 = nn.Conv2d(channels, channels // r,
24
+ kernel_size=1, bias=True)
25
+ self.relu1 = nn.ReLU(inplace=True)
26
+ self.conv_a2l = None if in_cl == 0 else nn.Conv2d(
27
+ channels // r, in_cl, kernel_size=1, bias=True)
28
+ self.conv_a2g = None if in_cg == 0 else nn.Conv2d(
29
+ channels // r, in_cg, kernel_size=1, bias=True)
30
+ self.sigmoid = nn.Sigmoid()
31
+
32
+ def forward(self, x):
33
+ x = x if type(x) is tuple else (x, 0)
34
+ id_l, id_g = x
35
+
36
+ x = id_l if type(id_g) is int else torch.cat([id_l, id_g], dim=1)
37
+ x = self.avgpool(x)
38
+ x = self.relu1(self.conv1(x))
39
+
40
+ x_l = 0 if self.conv_a2l is None else id_l * \
41
+ self.sigmoid(self.conv_a2l(x))
42
+ x_g = 0 if self.conv_a2g is None else id_g * \
43
+ self.sigmoid(self.conv_a2g(x))
44
+ return x_l, x_g
45
+
46
+
47
+ class FourierUnit(nn.Module):
48
+
49
+ def __init__(self, in_channels, out_channels, groups=1, spatial_scale_factor=None, spatial_scale_mode='bilinear',
50
+ spectral_pos_encoding=False, use_se=False, se_kwargs=None, ffc3d=False, fft_norm='ortho'):
51
+ # bn_layer not used
52
+ super(FourierUnit, self).__init__()
53
+ self.groups = groups
54
+
55
+ self.conv_layer = torch.nn.Conv2d(in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
56
+ out_channels=out_channels * 2,
57
+ kernel_size=1, stride=1, padding=0, groups=self.groups, bias=False)
58
+ self.bn = torch.nn.BatchNorm2d(out_channels * 2)
59
+ self.relu = torch.nn.ReLU(inplace=True)
60
+
61
+ # squeeze and excitation block
62
+ self.use_se = use_se
63
+ if use_se:
64
+ if se_kwargs is None:
65
+ se_kwargs = {}
66
+ self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)
67
+
68
+ self.spatial_scale_factor = spatial_scale_factor
69
+ self.spatial_scale_mode = spatial_scale_mode
70
+ self.spectral_pos_encoding = spectral_pos_encoding
71
+ self.ffc3d = ffc3d
72
+ self.fft_norm = fft_norm
73
+
74
+ def forward(self, x):
75
+ batch = x.shape[0]
76
+
77
+ if self.spatial_scale_factor is not None:
78
+ orig_size = x.shape[-2:]
79
+ x = F.interpolate(x, scale_factor=self.spatial_scale_factor, mode=self.spatial_scale_mode, align_corners=False)
80
+
81
+ r_size = x.size()
82
+ # (batch, c, h, w/2+1, 2)
83
+ fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
84
+ ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
85
+ ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
86
+ ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1)
87
+ ffted = ffted.view((batch, -1,) + ffted.size()[3:])
88
+
89
+ if self.spectral_pos_encoding:
90
+ height, width = ffted.shape[-2:]
91
+ coords_vert = torch.linspace(0, 1, height)[None, None, :, None].expand(batch, 1, height, width).to(ffted)
92
+ coords_hor = torch.linspace(0, 1, width)[None, None, None, :].expand(batch, 1, height, width).to(ffted)
93
+ ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)
94
+
95
+ if self.use_se:
96
+ ffted = self.se(ffted)
97
+
98
+ ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1)
99
+ ffted = self.relu(self.bn(ffted))
100
+
101
+ ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute(
102
+ 0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2)
103
+ ffted = torch.complex(ffted[..., 0], ffted[..., 1])
104
+
105
+ ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
106
+ output = torch.fft.irfftn(ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm)
107
+
108
+ if self.spatial_scale_factor is not None:
109
+ output = F.interpolate(output, size=orig_size, mode=self.spatial_scale_mode, align_corners=False)
110
+
111
+ return output
112
+
113
+
114
+ class SpectralTransform(nn.Module):
115
+
116
+ def __init__(self, in_channels, out_channels, stride=1, groups=1, enable_lfu=True, **fu_kwargs):
117
+ # bn_layer not used
118
+ super(SpectralTransform, self).__init__()
119
+ self.enable_lfu = enable_lfu
120
+ if stride == 2:
121
+ self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
122
+ else:
123
+ self.downsample = nn.Identity()
124
+
125
+ self.stride = stride
126
+ self.conv1 = nn.Sequential(
127
+ nn.Conv2d(in_channels, out_channels //
128
+ 2, kernel_size=1, groups=groups, bias=False),
129
+ nn.BatchNorm2d(out_channels // 2),
130
+ nn.ReLU(inplace=True)
131
+ )
132
+ self.fu = FourierUnit(
133
+ out_channels // 2, out_channels // 2, groups, **fu_kwargs)
134
+ if self.enable_lfu:
135
+ self.lfu = FourierUnit(
136
+ out_channels // 2, out_channels // 2, groups)
137
+ self.conv2 = torch.nn.Conv2d(
138
+ out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False)
139
+
140
+ def forward(self, x):
141
+
142
+ x = self.downsample(x)
143
+ x = self.conv1(x)
144
+ output = self.fu(x)
145
+
146
+ if self.enable_lfu:
147
+ n, c, h, w = x.shape
148
+ split_no = 2
149
+ split_s = h // split_no
150
+ xs = torch.cat(torch.split(
151
+ x[:, :c // 4], split_s, dim=-2), dim=1).contiguous()
152
+ xs = torch.cat(torch.split(xs, split_s, dim=-1),
153
+ dim=1).contiguous()
154
+ xs = self.lfu(xs)
155
+ xs = xs.repeat(1, 1, split_no, split_no).contiguous()
156
+ else:
157
+ xs = 0
158
+
159
+ output = self.conv2(x + output + xs)
160
+
161
+ return output
162
+
163
+
164
+ class FFC(nn.Module):
165
+
166
+ def __init__(self, in_channels, out_channels, kernel_size,
167
+ ratio_gin, ratio_gout, stride=1, padding=0,
168
+ dilation=1, groups=1, bias=False, enable_lfu=True,
169
+ padding_type='reflect', gated=False, **spectral_kwargs):
170
+ super(FFC, self).__init__()
171
+
172
+ assert stride == 1 or stride == 2, "Stride should be 1 or 2."
173
+ self.stride = stride
174
+
175
+ in_cg = int(in_channels * ratio_gin)
176
+ in_cl = in_channels - in_cg
177
+ out_cg = int(out_channels * ratio_gout)
178
+ out_cl = out_channels - out_cg
179
+ #groups_g = 1 if groups == 1 else int(groups * ratio_gout)
180
+ #groups_l = 1 if groups == 1 else groups - groups_g
181
+
182
+ self.ratio_gin = ratio_gin
183
+ self.ratio_gout = ratio_gout
184
+ self.global_in_num = in_cg
185
+
186
+ module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
187
+ self.convl2l = module(in_cl, out_cl, kernel_size,
188
+ stride, padding, dilation, groups, bias, padding_mode=padding_type)
189
+ module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
190
+ self.convl2g = module(in_cl, out_cg, kernel_size,
191
+ stride, padding, dilation, groups, bias, padding_mode=padding_type)
192
+ module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
193
+ self.convg2l = module(in_cg, out_cl, kernel_size,
194
+ stride, padding, dilation, groups, bias, padding_mode=padding_type)
195
+ module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
196
+ self.convg2g = module(
197
+ in_cg, out_cg, stride, 1 if groups == 1 else groups // 2, enable_lfu, **spectral_kwargs)
198
+
199
+ self.gated = gated
200
+ module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
201
+ self.gate = module(in_channels, 2, 1)
202
+
203
+ def forward(self, x):
204
+ x_l, x_g = x if type(x) is tuple else (x, 0)
205
+ out_xl, out_xg = 0, 0
206
+
207
+ if self.gated:
208
+ total_input_parts = [x_l]
209
+ if torch.is_tensor(x_g):
210
+ total_input_parts.append(x_g)
211
+ total_input = torch.cat(total_input_parts, dim=1)
212
+
213
+ gates = torch.sigmoid(self.gate(total_input))
214
+ g2l_gate, l2g_gate = gates.chunk(2, dim=1)
215
+ else:
216
+ g2l_gate, l2g_gate = 1, 1
217
+
218
+ if self.ratio_gout != 1:
219
+ out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
220
+ if self.ratio_gout != 0:
221
+ out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)
222
+
223
+ return out_xl, out_xg
224
+
225
+
226
+ class FFC_BN_ACT(nn.Module):
227
+
228
+ def __init__(self, in_channels, out_channels,
229
+ kernel_size, ratio_gin, ratio_gout,
230
+ stride=1, padding=0, dilation=1, groups=1, bias=False,
231
+ norm_layer=nn.BatchNorm2d, activation_layer=nn.Identity,
232
+ padding_type='reflect',
233
+ enable_lfu=True, **kwargs):
234
+ super(FFC_BN_ACT, self).__init__()
235
+ self.ffc = FFC(in_channels, out_channels, kernel_size,
236
+ ratio_gin, ratio_gout, stride, padding, dilation,
237
+ groups, bias, enable_lfu, padding_type=padding_type, **kwargs)
238
+ lnorm = nn.Identity if ratio_gout == 1 else norm_layer
239
+ gnorm = nn.Identity if ratio_gout == 0 else norm_layer
240
+ global_channels = int(out_channels * ratio_gout)
241
+ self.bn_l = lnorm(out_channels - global_channels)
242
+ self.bn_g = gnorm(global_channels)
243
+
244
+ lact = nn.Identity if ratio_gout == 1 else activation_layer
245
+ gact = nn.Identity if ratio_gout == 0 else activation_layer
246
+ self.act_l = lact(inplace=True)
247
+ self.act_g = gact(inplace=True)
248
+
249
+ def forward(self, x):
250
+ x_l, x_g = self.ffc(x)
251
+ x_l = self.act_l(self.bn_l(x_l))
252
+ x_g = self.act_g(self.bn_g(x_g))
253
+ return x_l, x_g
254
+
255
+
256
+ class FFCResnetBlock(nn.Module):
257
+ def __init__(self, dim, padding_type, norm_layer, activation_layer=nn.ReLU, dilation=1,
258
+ spatial_transform_kwargs=None, inline=False, **conv_kwargs):
259
+ super().__init__()
260
+ self.conv1 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation,
261
+ norm_layer=norm_layer,
262
+ activation_layer=activation_layer,
263
+ padding_type=padding_type,
264
+ **conv_kwargs)
265
+ self.conv2 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation,
266
+ norm_layer=norm_layer,
267
+ activation_layer=activation_layer,
268
+ padding_type=padding_type,
269
+ **conv_kwargs)
270
+ if spatial_transform_kwargs is not None:
271
+ self.conv1 = LearnableSpatialTransformWrapper(self.conv1, **spatial_transform_kwargs)
272
+ self.conv2 = LearnableSpatialTransformWrapper(self.conv2, **spatial_transform_kwargs)
273
+ self.inline = inline
274
+
275
+ def forward(self, x):
276
+ if self.inline:
277
+ x_l, x_g = x[:, :-self.conv1.ffc.global_in_num], x[:, -self.conv1.ffc.global_in_num:]
278
+ else:
279
+ x_l, x_g = x if type(x) is tuple else (x, 0)
280
+
281
+ id_l, id_g = x_l, x_g
282
+
283
+ x_l, x_g = self.conv1((x_l, x_g))
284
+ x_l, x_g = self.conv2((x_l, x_g))
285
+
286
+ x_l, x_g = id_l + x_l, id_g + x_g
287
+ out = x_l, x_g
288
+ if self.inline:
289
+ out = torch.cat(out, dim=1)
290
+ return out
291
+
292
+
293
+ class ConcatTupleLayer(nn.Module):
294
+ def forward(self, x):
295
+ assert isinstance(x, tuple)
296
+ x_l, x_g = x
297
+ assert torch.is_tensor(x_l) or torch.is_tensor(x_g)
298
+ if not torch.is_tensor(x_g):
299
+ return x_l
300
+ return torch.cat(x, dim=1)
301
+
302
+
303
+ class FFCResNetGenerator(nn.Module):
304
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
305
+ padding_type='reflect', activation_layer=nn.ReLU,
306
+ up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True),
307
+ init_conv_kwargs={}, downsample_conv_kwargs={}, resnet_conv_kwargs={},
308
+ spatial_transform_layers=None, spatial_transform_kwargs={},
309
+ add_out_act=True, max_features=1024, out_ffc=False, out_ffc_kwargs={}):
310
+ assert (n_blocks >= 0)
311
+ super().__init__()
312
+
313
+ model = [nn.ReflectionPad2d(3),
314
+ FFC_BN_ACT(input_nc, ngf, kernel_size=7, padding=0, norm_layer=norm_layer,
315
+ activation_layer=activation_layer, **init_conv_kwargs)]
316
+
317
+ ### downsample
318
+ for i in range(n_downsampling):
319
+ mult = 2 ** i
320
+ if i == n_downsampling - 1:
321
+ cur_conv_kwargs = dict(downsample_conv_kwargs)
322
+ cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get('ratio_gin', 0)
323
+ else:
324
+ cur_conv_kwargs = downsample_conv_kwargs
325
+ model += [FFC_BN_ACT(min(max_features, ngf * mult),
326
+ min(max_features, ngf * mult * 2),
327
+ kernel_size=3, stride=2, padding=1,
328
+ norm_layer=norm_layer,
329
+ activation_layer=activation_layer,
330
+ **cur_conv_kwargs)]
331
+
332
+ mult = 2 ** n_downsampling
333
+ feats_num_bottleneck = min(max_features, ngf * mult)
334
+
335
+ ### resnet blocks
336
+ for i in range(n_blocks):
337
+ cur_resblock = FFCResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation_layer=activation_layer,
338
+ norm_layer=norm_layer, **resnet_conv_kwargs)
339
+ if spatial_transform_layers is not None and i in spatial_transform_layers:
340
+ cur_resblock = LearnableSpatialTransformWrapper(cur_resblock, **spatial_transform_kwargs)
341
+ model += [cur_resblock]
342
+
343
+ model += [ConcatTupleLayer()]
344
+
345
+ ### upsample
346
+ for i in range(n_downsampling):
347
+ mult = 2 ** (n_downsampling - i)
348
+ model += [nn.ConvTranspose2d(min(max_features, ngf * mult),
349
+ min(max_features, int(ngf * mult / 2)),
350
+ kernel_size=3, stride=2, padding=1, output_padding=1),
351
+ up_norm_layer(min(max_features, int(ngf * mult / 2))),
352
+ up_activation]
353
+
354
+ if out_ffc:
355
+ model += [FFCResnetBlock(ngf, padding_type=padding_type, activation_layer=activation_layer,
356
+ norm_layer=norm_layer, inline=True, **out_ffc_kwargs)]
357
+
358
+ model += [nn.ReflectionPad2d(3),
359
+ nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
360
+ if add_out_act:
361
+ model.append(get_activation('tanh' if add_out_act is True else add_out_act))
362
+ self.model = nn.Sequential(*model)
363
+
364
+ def forward(self, input):
365
+ return self.model(input)
366
+
367
+
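To see how the pieces fit together, here is a minimal sketch that instantiates FFCResNetGenerator the way make_generator() does. The ratio_gin/ratio_gout values and the 4-channel input (masked image + mask) are illustrative and are not taken from the big-lama training config, which ships inside the downloaded weights folder:

import torch
from saicinpainting.training.modules.ffc import FFCResNetGenerator

gen = FFCResNetGenerator(
    input_nc=4, output_nc=3, ngf=64, n_downsampling=3, n_blocks=2,
    init_conv_kwargs=dict(ratio_gin=0, ratio_gout=0),          # purely local branch at the stem
    downsample_conv_kwargs=dict(ratio_gin=0, ratio_gout=0),
    resnet_conv_kwargs=dict(ratio_gin=0.75, ratio_gout=0.75),  # mostly global (spectral) branch in the bottleneck
)
x = torch.randn(1, 4, 256, 256)   # 3 image channels + 1 mask channel
print(gen(x).shape)               # torch.Size([1, 3, 256, 256])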
inpaint/saicinpainting/training/modules/multidilated_conv.py ADDED
@@ -0,0 +1,98 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import random
4
+ from saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv
5
+
6
+ class MultidilatedConv(nn.Module):
7
+ def __init__(self, in_dim, out_dim, kernel_size, dilation_num=3, comb_mode='sum', equal_dim=True,
8
+ shared_weights=False, padding=1, min_dilation=1, shuffle_in_channels=False, use_depthwise=False, **kwargs):
9
+ super().__init__()
10
+ convs = []
11
+ self.equal_dim = equal_dim
12
+ assert comb_mode in ('cat_out', 'sum', 'cat_in', 'cat_both'), comb_mode
13
+ if comb_mode in ('cat_out', 'cat_both'):
14
+ self.cat_out = True
15
+ if equal_dim:
16
+ assert out_dim % dilation_num == 0
17
+ out_dims = [out_dim // dilation_num] * dilation_num
18
+ self.index = sum([[i + j * (out_dims[0]) for j in range(dilation_num)] for i in range(out_dims[0])], [])
19
+ else:
20
+ out_dims = [out_dim // 2 ** (i + 1) for i in range(dilation_num - 1)]
21
+ out_dims.append(out_dim - sum(out_dims))
22
+ index = []
23
+ starts = [0] + out_dims[:-1]
24
+ lengths = [out_dims[i] // out_dims[-1] for i in range(dilation_num)]
25
+ for i in range(out_dims[-1]):
26
+ for j in range(dilation_num):
27
+ index += list(range(starts[j], starts[j] + lengths[j]))
28
+ starts[j] += lengths[j]
29
+ self.index = index
30
+ assert(len(index) == out_dim)
31
+ self.out_dims = out_dims
32
+ else:
33
+ self.cat_out = False
34
+ self.out_dims = [out_dim] * dilation_num
35
+
36
+ if comb_mode in ('cat_in', 'cat_both'):
37
+ if equal_dim:
38
+ assert in_dim % dilation_num == 0
39
+ in_dims = [in_dim // dilation_num] * dilation_num
40
+ else:
41
+ in_dims = [in_dim // 2 ** (i + 1) for i in range(dilation_num - 1)]
42
+ in_dims.append(in_dim - sum(in_dims))
43
+ self.in_dims = in_dims
44
+ self.cat_in = True
45
+ else:
46
+ self.cat_in = False
47
+ self.in_dims = [in_dim] * dilation_num
48
+
49
+ conv_type = DepthWiseSeperableConv if use_depthwise else nn.Conv2d
50
+ dilation = min_dilation
51
+ for i in range(dilation_num):
52
+ if isinstance(padding, int):
53
+ cur_padding = padding * dilation
54
+ else:
55
+ cur_padding = padding[i]
56
+ convs.append(conv_type(
57
+ self.in_dims[i], self.out_dims[i], kernel_size, padding=cur_padding, dilation=dilation, **kwargs
58
+ ))
59
+ if i > 0 and shared_weights:
60
+ convs[-1].weight = convs[0].weight
61
+ convs[-1].bias = convs[0].bias
62
+ dilation *= 2
63
+ self.convs = nn.ModuleList(convs)
64
+
65
+ self.shuffle_in_channels = shuffle_in_channels
66
+ if self.shuffle_in_channels:
67
+ # shuffle list as shuffling of tensors is nondeterministic
68
+ in_channels_permute = list(range(in_dim))
69
+ random.shuffle(in_channels_permute)
70
+ # save as buffer so it is saved and loaded with checkpoint
71
+ self.register_buffer('in_channels_permute', torch.tensor(in_channels_permute))
72
+
73
+ def forward(self, x):
74
+ if self.shuffle_in_channels:
75
+ x = x[:, self.in_channels_permute]
76
+
77
+ outs = []
78
+ if self.cat_in:
79
+ if self.equal_dim:
80
+ x = x.chunk(len(self.convs), dim=1)
81
+ else:
82
+ new_x = []
83
+ start = 0
84
+ for dim in self.in_dims:
85
+ new_x.append(x[:, start:start+dim])
86
+ start += dim
87
+ x = new_x
88
+ for i, conv in enumerate(self.convs):
89
+ if self.cat_in:
90
+ input = x[i]
91
+ else:
92
+ input = x
93
+ outs.append(conv(input))
94
+ if self.cat_out:
95
+ out = torch.cat(outs, dim=1)[:, self.index]
96
+ else:
97
+ out = sum(outs)
98
+ return out
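A short sketch of the default 'sum' mode: three parallel convolutions with dilations 1, 2 and 4 (doubling from min_dilation) are applied and summed, and the padding scales with each dilation so the spatial size is preserved:

import torch
from saicinpainting.training.modules.multidilated_conv import MultidilatedConv

conv = MultidilatedConv(in_dim=64, out_dim=96, kernel_size=3, dilation_num=3, comb_mode='sum')
y = conv(torch.randn(1, 64, 32, 32))
print(y.shape)   # torch.Size([1, 96, 32, 32])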
inpaint/saicinpainting/training/modules/multiscale.py ADDED
@@ -0,0 +1,244 @@
1
+ from typing import List, Tuple, Union, Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ from saicinpainting.training.modules.base import get_conv_block_ctor, get_activation
8
+ from saicinpainting.training.modules.pix2pixhd import ResnetBlock
9
+
10
+
11
+ class ResNetHead(nn.Module):
12
+ def __init__(self, input_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
13
+ padding_type='reflect', conv_kind='default', activation=nn.ReLU(True)):
14
+ assert (n_blocks >= 0)
15
+ super(ResNetHead, self).__init__()
16
+
17
+ conv_layer = get_conv_block_ctor(conv_kind)
18
+
19
+ model = [nn.ReflectionPad2d(3),
20
+ conv_layer(input_nc, ngf, kernel_size=7, padding=0),
21
+ norm_layer(ngf),
22
+ activation]
23
+
24
+ ### downsample
25
+ for i in range(n_downsampling):
26
+ mult = 2 ** i
27
+ model += [conv_layer(ngf * mult, ngf * mult * 2, kernel_size=3, stride=2, padding=1),
28
+ norm_layer(ngf * mult * 2),
29
+ activation]
30
+
31
+ mult = 2 ** n_downsampling
32
+
33
+ ### resnet blocks
34
+ for i in range(n_blocks):
35
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
36
+ conv_kind=conv_kind)]
37
+
38
+ self.model = nn.Sequential(*model)
39
+
40
+ def forward(self, input):
41
+ return self.model(input)
42
+
43
+
44
+ class ResNetTail(nn.Module):
45
+ def __init__(self, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
46
+ padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
47
+ up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0,
48
+ add_in_proj=None):
49
+ assert (n_blocks >= 0)
50
+ super(ResNetTail, self).__init__()
51
+
52
+ mult = 2 ** n_downsampling
53
+
54
+ model = []
55
+
56
+ if add_in_proj is not None:
57
+ model.append(nn.Conv2d(add_in_proj, ngf * mult, kernel_size=1))
58
+
59
+ ### resnet blocks
60
+ for i in range(n_blocks):
61
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
62
+ conv_kind=conv_kind)]
63
+
64
+ ### upsample
65
+ for i in range(n_downsampling):
66
+ mult = 2 ** (n_downsampling - i)
67
+ model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2), kernel_size=3, stride=2, padding=1,
68
+ output_padding=1),
69
+ up_norm_layer(int(ngf * mult / 2)),
70
+ up_activation]
71
+ self.model = nn.Sequential(*model)
72
+
73
+ out_layers = []
74
+ for _ in range(out_extra_layers_n):
75
+ out_layers += [nn.Conv2d(ngf, ngf, kernel_size=1, padding=0),
76
+ up_norm_layer(ngf),
77
+ up_activation]
78
+ out_layers += [nn.ReflectionPad2d(3),
79
+ nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
80
+
81
+ if add_out_act:
82
+ out_layers.append(get_activation('tanh' if add_out_act is True else add_out_act))
83
+
84
+ self.out_proj = nn.Sequential(*out_layers)
85
+
86
+ def forward(self, input, return_last_act=False):
87
+ features = self.model(input)
88
+ out = self.out_proj(features)
89
+ if return_last_act:
90
+ return out, features
91
+ else:
92
+ return out
93
+
94
+
95
+ class MultiscaleResNet(nn.Module):
96
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=2, n_blocks_head=2, n_blocks_tail=6, n_scales=3,
97
+ norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
98
+ up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0,
99
+ out_cumulative=False, return_only_hr=False):
100
+ super().__init__()
101
+
102
+ self.heads = nn.ModuleList([ResNetHead(input_nc, ngf=ngf, n_downsampling=n_downsampling,
103
+ n_blocks=n_blocks_head, norm_layer=norm_layer, padding_type=padding_type,
104
+ conv_kind=conv_kind, activation=activation)
105
+ for i in range(n_scales)])
106
+ tail_in_feats = ngf * (2 ** n_downsampling) + ngf
107
+ self.tails = nn.ModuleList([ResNetTail(output_nc,
108
+ ngf=ngf, n_downsampling=n_downsampling,
109
+ n_blocks=n_blocks_tail, norm_layer=norm_layer, padding_type=padding_type,
110
+ conv_kind=conv_kind, activation=activation, up_norm_layer=up_norm_layer,
111
+ up_activation=up_activation, add_out_act=add_out_act,
112
+ out_extra_layers_n=out_extra_layers_n,
113
+ add_in_proj=None if (i == n_scales - 1) else tail_in_feats)
114
+ for i in range(n_scales)])
115
+
116
+ self.out_cumulative = out_cumulative
117
+ self.return_only_hr = return_only_hr
118
+
119
+ @property
120
+ def num_scales(self):
121
+ return len(self.heads)
122
+
123
+ def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \
124
+ -> Union[torch.Tensor, List[torch.Tensor]]:
125
+ """
126
+ :param ms_inputs: List of inputs of different resolutions from HR to LR
127
+ :param smallest_scales_num: int or None, number of smallest scales to take at input
128
+ :return: Depending on return_only_hr:
129
+ True: Only the most HR output
130
+ False: List of outputs of different resolutions from HR to LR
131
+ """
132
+ if smallest_scales_num is None:
133
+ assert len(self.heads) == len(ms_inputs), (len(self.heads), len(ms_inputs), smallest_scales_num)
134
+ smallest_scales_num = len(self.heads)
135
+ else:
136
+ assert smallest_scales_num == len(ms_inputs) <= len(self.heads), (len(self.heads), len(ms_inputs), smallest_scales_num)
137
+
138
+ cur_heads = self.heads[-smallest_scales_num:]
139
+ ms_features = [cur_head(cur_inp) for cur_head, cur_inp in zip(cur_heads, ms_inputs)]
140
+
141
+ all_outputs = []
142
+ prev_tail_features = None
143
+ for i in range(len(ms_features)):
144
+ scale_i = -i - 1
145
+
146
+ cur_tail_input = ms_features[-i - 1]
147
+ if prev_tail_features is not None:
148
+ if prev_tail_features.shape != cur_tail_input.shape:
149
+ prev_tail_features = F.interpolate(prev_tail_features, size=cur_tail_input.shape[2:],
150
+ mode='bilinear', align_corners=False)
151
+ cur_tail_input = torch.cat((cur_tail_input, prev_tail_features), dim=1)
152
+
153
+ cur_out, cur_tail_feats = self.tails[scale_i](cur_tail_input, return_last_act=True)
154
+
155
+ prev_tail_features = cur_tail_feats
156
+ all_outputs.append(cur_out)
157
+
158
+ if self.out_cumulative:
159
+ all_outputs_cum = [all_outputs[0]]
160
+ for i in range(1, len(ms_features)):
161
+ cur_out = all_outputs[i]
162
+ cur_out_cum = cur_out + F.interpolate(all_outputs_cum[-1], size=cur_out.shape[2:],
163
+ mode='bilinear', align_corners=False)
164
+ all_outputs_cum.append(cur_out_cum)
165
+ all_outputs = all_outputs_cum
166
+
167
+ if self.return_only_hr:
168
+ return all_outputs[-1]
169
+ else:
170
+ return all_outputs[::-1]
171
+
172
+
173
+ class MultiscaleDiscriminatorSimple(nn.Module):
174
+ def __init__(self, ms_impl):
175
+ super().__init__()
176
+ self.ms_impl = nn.ModuleList(ms_impl)
177
+
178
+ @property
179
+ def num_scales(self):
180
+ return len(self.ms_impl)
181
+
182
+ def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \
183
+ -> List[Tuple[torch.Tensor, List[torch.Tensor]]]:
184
+ """
185
+ :param ms_inputs: List of inputs of different resolutions from HR to LR
186
+ :param smallest_scales_num: int or None, number of smallest scales to take at input
187
+ :return: List of pairs (prediction, features) for different resolutions from HR to LR
188
+ """
189
+ if smallest_scales_num is None:
190
+ assert len(self.ms_impl) == len(ms_inputs), (len(self.ms_impl), len(ms_inputs), smallest_scales_num)
191
+ smallest_scales_num = len(self.ms_impl)
192
+ else:
193
+ assert smallest_scales_num == len(ms_inputs) <= len(self.ms_impl), \
194
+ (len(self.ms_impl), len(ms_inputs), smallest_scales_num)
195
+
196
+ return [cur_discr(cur_input) for cur_discr, cur_input in zip(self.ms_impl[-smallest_scales_num:], ms_inputs)]
197
+
198
+
199
+ class SingleToMultiScaleInputMixin:
200
+ def forward(self, x: torch.Tensor) -> List:
201
+ orig_height, orig_width = x.shape[2:]
202
+ factors = [2 ** i for i in range(self.num_scales)]
203
+ ms_inputs = [F.interpolate(x, size=(orig_height // f, orig_width // f), mode='bilinear', align_corners=False)
204
+ for f in factors]
205
+ return super().forward(ms_inputs)
206
+
207
+
208
+ class GeneratorMultiToSingleOutputMixin:
209
+ def forward(self, x):
210
+ return super().forward(x)[0]
211
+
212
+
213
+ class DiscriminatorMultiToSingleOutputMixin:
214
+ def forward(self, x):
215
+ out_feat_tuples = super().forward(x)
216
+ return out_feat_tuples[0][0], [f for _, flist in out_feat_tuples for f in flist]
217
+
218
+
219
+ class DiscriminatorMultiToSingleOutputStackedMixin:
220
+ def __init__(self, *args, return_feats_only_levels=None, **kwargs):
221
+ super().__init__(*args, **kwargs)
222
+ self.return_feats_only_levels = return_feats_only_levels
223
+
224
+ def forward(self, x):
225
+ out_feat_tuples = super().forward(x)
226
+ outs = [out for out, _ in out_feat_tuples]
227
+ scaled_outs = [outs[0]] + [F.interpolate(cur_out, size=outs[0].shape[-2:],
228
+ mode='bilinear', align_corners=False)
229
+ for cur_out in outs[1:]]
230
+ out = torch.cat(scaled_outs, dim=1)
231
+ if self.return_feats_only_levels is not None:
232
+ feat_lists = [out_feat_tuples[i][1] for i in self.return_feats_only_levels]
233
+ else:
234
+ feat_lists = [flist for _, flist in out_feat_tuples]
235
+ feats = [f for flist in feat_lists for f in flist]
236
+ return out, feats
237
+
238
+
239
+ class MultiscaleDiscrSingleInput(SingleToMultiScaleInputMixin, DiscriminatorMultiToSingleOutputStackedMixin, MultiscaleDiscriminatorSimple):
240
+ pass
241
+
242
+
243
+ class MultiscaleResNetSingle(GeneratorMultiToSingleOutputMixin, SingleToMultiScaleInputMixin, MultiscaleResNet):
244
+ pass
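A minimal sketch of the multiscale generator on a three-level pyramid (sizes and block counts are illustrative). Inputs go from high to low resolution, as the docstring requires, and the tails run LR-first so each finer scale is conditioned on the coarser features:

import torch
import torch.nn.functional as F
from saicinpainting.training.modules.multiscale import MultiscaleResNet

net = MultiscaleResNet(input_nc=3, output_nc=3, ngf=64, n_downsampling=2,
                       n_blocks_head=1, n_blocks_tail=2, n_scales=3)
x = torch.randn(1, 3, 256, 256)
pyramid = [x,
           F.interpolate(x, scale_factor=0.5),
           F.interpolate(x, scale_factor=0.25)]   # HR -> LR
outs = net(pyramid)                               # list of outputs, HR first
print([tuple(o.shape) for o in outs])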
inpaint/saicinpainting/training/modules/pix2pixhd.py ADDED
@@ -0,0 +1,669 @@
1
+ # original: https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py
2
+ import collections
3
+ from functools import partial
4
+ import functools
5
+ import logging
6
+ from collections import defaultdict
7
+
8
+ import numpy as np
9
+ import torch.nn as nn
10
+
11
+ from saicinpainting.training.modules.base import BaseDiscriminator, deconv_factory, get_conv_block_ctor, get_norm_layer, get_activation
12
+ from saicinpainting.training.modules.ffc import FFCResnetBlock
13
+ from saicinpainting.training.modules.multidilated_conv import MultidilatedConv
14
+
15
+ class DotDict(defaultdict):
16
+ # https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
17
+ """dot.notation access to dictionary attributes"""
18
+ __getattr__ = defaultdict.get
19
+ __setattr__ = defaultdict.__setitem__
20
+ __delattr__ = defaultdict.__delitem__
21
+
22
+ class Identity(nn.Module):
23
+ def __init__(self):
24
+ super().__init__()
25
+
26
+ def forward(self, x):
27
+ return x
28
+
29
+
30
+ class ResnetBlock(nn.Module):
31
+ def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default',
32
+ dilation=1, in_dim=None, groups=1, second_dilation=None):
33
+ super(ResnetBlock, self).__init__()
34
+ self.in_dim = in_dim
35
+ self.dim = dim
36
+ if second_dilation is None:
37
+ second_dilation = dilation
38
+ self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout,
39
+ conv_kind=conv_kind, dilation=dilation, in_dim=in_dim, groups=groups,
40
+ second_dilation=second_dilation)
41
+
42
+ if self.in_dim is not None:
43
+ self.input_conv = nn.Conv2d(in_dim, dim, 1)
44
+
45
+ self.out_channnels = dim
46
+
47
+ def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default',
48
+ dilation=1, in_dim=None, groups=1, second_dilation=1):
49
+ conv_layer = get_conv_block_ctor(conv_kind)
50
+
51
+ conv_block = []
52
+ p = 0
53
+ if padding_type == 'reflect':
54
+ conv_block += [nn.ReflectionPad2d(dilation)]
55
+ elif padding_type == 'replicate':
56
+ conv_block += [nn.ReplicationPad2d(dilation)]
57
+ elif padding_type == 'zero':
58
+ p = dilation
59
+ else:
60
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
61
+
62
+ if in_dim is None:
63
+ in_dim = dim
64
+
65
+ conv_block += [conv_layer(in_dim, dim, kernel_size=3, padding=p, dilation=dilation),
66
+ norm_layer(dim),
67
+ activation]
68
+ if use_dropout:
69
+ conv_block += [nn.Dropout(0.5)]
70
+
71
+ p = 0
72
+ if padding_type == 'reflect':
73
+ conv_block += [nn.ReflectionPad2d(second_dilation)]
74
+ elif padding_type == 'replicate':
75
+ conv_block += [nn.ReplicationPad2d(second_dilation)]
76
+ elif padding_type == 'zero':
77
+ p = second_dilation
78
+ else:
79
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
80
+ conv_block += [conv_layer(dim, dim, kernel_size=3, padding=p, dilation=second_dilation, groups=groups),
81
+ norm_layer(dim)]
82
+
83
+ return nn.Sequential(*conv_block)
84
+
85
+ def forward(self, x):
86
+ x_before = x
87
+ if self.in_dim is not None:
88
+ x = self.input_conv(x)
89
+ out = x + self.conv_block(x_before)
90
+ return out
91
+
92
+ class ResnetBlock5x5(nn.Module):
93
+ def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default',
94
+ dilation=1, in_dim=None, groups=1, second_dilation=None):
95
+ super(ResnetBlock5x5, self).__init__()
96
+ self.in_dim = in_dim
97
+ self.dim = dim
98
+ if second_dilation is None:
99
+ second_dilation = dilation
100
+ self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout,
101
+ conv_kind=conv_kind, dilation=dilation, in_dim=in_dim, groups=groups,
102
+ second_dilation=second_dilation)
103
+
104
+ if self.in_dim is not None:
105
+ self.input_conv = nn.Conv2d(in_dim, dim, 1)
106
+
107
+ self.out_channnels = dim
108
+
109
+ def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default',
110
+ dilation=1, in_dim=None, groups=1, second_dilation=1):
111
+ conv_layer = get_conv_block_ctor(conv_kind)
112
+
113
+ conv_block = []
114
+ p = 0
115
+ if padding_type == 'reflect':
116
+ conv_block += [nn.ReflectionPad2d(dilation * 2)]
117
+ elif padding_type == 'replicate':
118
+ conv_block += [nn.ReplicationPad2d(dilation * 2)]
119
+ elif padding_type == 'zero':
120
+ p = dilation * 2
121
+ else:
122
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
123
+
124
+ if in_dim is None:
125
+ in_dim = dim
126
+
127
+ conv_block += [conv_layer(in_dim, dim, kernel_size=5, padding=p, dilation=dilation),
128
+ norm_layer(dim),
129
+ activation]
130
+ if use_dropout:
131
+ conv_block += [nn.Dropout(0.5)]
132
+
133
+ p = 0
134
+ if padding_type == 'reflect':
135
+ conv_block += [nn.ReflectionPad2d(second_dilation * 2)]
136
+ elif padding_type == 'replicate':
137
+ conv_block += [nn.ReplicationPad2d(second_dilation * 2)]
138
+ elif padding_type == 'zero':
139
+ p = second_dilation * 2
140
+ else:
141
+ raise NotImplementedError('padding [%s] is not implemented' % padding_type)
142
+ conv_block += [conv_layer(dim, dim, kernel_size=5, padding=p, dilation=second_dilation, groups=groups),
143
+ norm_layer(dim)]
144
+
145
+ return nn.Sequential(*conv_block)
146
+
147
+ def forward(self, x):
148
+ x_before = x
149
+ if self.in_dim is not None:
150
+ x = self.input_conv(x)
151
+ out = x + self.conv_block(x_before)
152
+ return out
153
+
154
+
155
+ class MultidilatedResnetBlock(nn.Module):
156
+ def __init__(self, dim, padding_type, conv_layer, norm_layer, activation=nn.ReLU(True), use_dropout=False):
157
+ super().__init__()
158
+ self.conv_block = self.build_conv_block(dim, padding_type, conv_layer, norm_layer, activation, use_dropout)
159
+
160
+ def build_conv_block(self, dim, padding_type, conv_layer, norm_layer, activation, use_dropout, dilation=1):
161
+ conv_block = []
162
+ conv_block += [conv_layer(dim, dim, kernel_size=3, padding_mode=padding_type),
163
+ norm_layer(dim),
164
+ activation]
165
+ if use_dropout:
166
+ conv_block += [nn.Dropout(0.5)]
167
+
168
+ conv_block += [conv_layer(dim, dim, kernel_size=3, padding_mode=padding_type),
169
+ norm_layer(dim)]
170
+
171
+ return nn.Sequential(*conv_block)
172
+
173
+ def forward(self, x):
174
+ out = x + self.conv_block(x)
175
+ return out
176
+
177
+
178
+ class MultiDilatedGlobalGenerator(nn.Module):
179
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3,
180
+ n_blocks=3, norm_layer=nn.BatchNorm2d,
181
+ padding_type='reflect', conv_kind='default',
182
+ deconv_kind='convtranspose', activation=nn.ReLU(True),
183
+ up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True),
184
+ add_out_act=True, max_features=1024, multidilation_kwargs={},
185
+ ffc_positions=None, ffc_kwargs={}):
186
+ assert (n_blocks >= 0)
187
+ super().__init__()
188
+
189
+ conv_layer = get_conv_block_ctor(conv_kind)
190
+ resnet_conv_layer = functools.partial(get_conv_block_ctor('multidilated'), **multidilation_kwargs)
191
+ norm_layer = get_norm_layer(norm_layer)
192
+ if affine is not None:
193
+ norm_layer = partial(norm_layer, affine=affine)
194
+ up_norm_layer = get_norm_layer(up_norm_layer)
195
+ if affine is not None:
196
+ up_norm_layer = partial(up_norm_layer, affine=affine)
197
+
198
+ model = [nn.ReflectionPad2d(3),
199
+ conv_layer(input_nc, ngf, kernel_size=7, padding=0),
200
+ norm_layer(ngf),
201
+ activation]
202
+
203
+ identity = Identity()
204
+ ### downsample
205
+ for i in range(n_downsampling):
206
+ mult = 2 ** i
207
+
208
+ model += [conv_layer(min(max_features, ngf * mult),
209
+ min(max_features, ngf * mult * 2),
210
+ kernel_size=3, stride=2, padding=1),
211
+ norm_layer(min(max_features, ngf * mult * 2)),
212
+ activation]
213
+
214
+ mult = 2 ** n_downsampling
215
+ feats_num_bottleneck = min(max_features, ngf * mult)
216
+
217
+ ### resnet blocks
218
+ for i in range(n_blocks):
219
+ if ffc_positions is not None and i in ffc_positions:
220
+ model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU,
221
+ inline=True, **ffc_kwargs)]
222
+ model += [MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type,
223
+ conv_layer=resnet_conv_layer, activation=activation,
224
+ norm_layer=norm_layer)]
225
+
226
+ ### upsample
227
+ for i in range(n_downsampling):
228
+ mult = 2 ** (n_downsampling - i)
229
+ model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features)
230
+ model += [nn.ReflectionPad2d(3),
231
+ nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
232
+ if add_out_act:
233
+ model.append(get_activation('tanh' if add_out_act is True else add_out_act))
234
+ self.model = nn.Sequential(*model)
235
+
236
+ def forward(self, input):
237
+ return self.model(input)
238
+
239
+ class ConfigGlobalGenerator(nn.Module):
240
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3,
241
+ n_blocks=3, norm_layer=nn.BatchNorm2d,
242
+ padding_type='reflect', conv_kind='default',
243
+ deconv_kind='convtranspose', activation=nn.ReLU(True),
244
+ up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True),
245
+ add_out_act=True, max_features=1024,
246
+ manual_block_spec=[],
247
+ resnet_block_kind='multidilatedresnetblock',
248
+ resnet_conv_kind='multidilated',
249
+ resnet_dilation=1,
250
+ multidilation_kwargs={}):
251
+ assert (n_blocks >= 0)
252
+ super().__init__()
253
+
254
+ conv_layer = get_conv_block_ctor(conv_kind)
255
+ resnet_conv_layer = functools.partial(get_conv_block_ctor(resnet_conv_kind), **multidilation_kwargs)
256
+ norm_layer = get_norm_layer(norm_layer)
257
+ if affine is not None:
258
+ norm_layer = partial(norm_layer, affine=affine)
259
+ up_norm_layer = get_norm_layer(up_norm_layer)
260
+ if affine is not None:
261
+ up_norm_layer = partial(up_norm_layer, affine=affine)
262
+
263
+ model = [nn.ReflectionPad2d(3),
264
+ conv_layer(input_nc, ngf, kernel_size=7, padding=0),
265
+ norm_layer(ngf),
266
+ activation]
267
+
268
+ identity = Identity()
269
+
270
+ ### downsample
271
+ for i in range(n_downsampling):
272
+ mult = 2 ** i
273
+ model += [conv_layer(min(max_features, ngf * mult),
274
+ min(max_features, ngf * mult * 2),
275
+ kernel_size=3, stride=2, padding=1),
276
+ norm_layer(min(max_features, ngf * mult * 2)),
277
+ activation]
278
+
279
+ mult = 2 ** n_downsampling
280
+ feats_num_bottleneck = min(max_features, ngf * mult)
281
+
282
+ if len(manual_block_spec) == 0:
283
+ manual_block_spec = [
284
+ DotDict(lambda : None, {
285
+ 'n_blocks': n_blocks,
286
+ 'use_default': True})
287
+ ]
288
+
289
+ ### resnet blocks
290
+ for block_spec in manual_block_spec:
291
+ def make_and_add_blocks(model, block_spec):
292
+ block_spec = DotDict(lambda : None, block_spec)
293
+ if not block_spec.use_default:
294
+ resnet_conv_layer = functools.partial(get_conv_block_ctor(block_spec.resnet_conv_kind), **block_spec.multidilation_kwargs)
295
+ resnet_conv_kind = block_spec.resnet_conv_kind
296
+ resnet_block_kind = block_spec.resnet_block_kind
297
+ if block_spec.resnet_dilation is not None:
298
+ resnet_dilation = block_spec.resnet_dilation
299
+ for i in range(block_spec.n_blocks):
300
+ if resnet_block_kind == "multidilatedresnetblock":
301
+ model += [MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type,
302
+ conv_layer=resnet_conv_layer, activation=activation,
303
+ norm_layer=norm_layer)]
304
+ if resnet_block_kind == "resnetblock":
305
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
306
+ conv_kind=resnet_conv_kind)]
307
+ if resnet_block_kind == "resnetblock5x5":
308
+ model += [ResnetBlock5x5(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
309
+ conv_kind=resnet_conv_kind)]
310
+ if resnet_block_kind == "resnetblockdwdil":
311
+ model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
312
+ conv_kind=resnet_conv_kind, dilation=resnet_dilation, second_dilation=resnet_dilation)]
313
+ make_and_add_blocks(model, block_spec)
314
+
315
+ ### upsample
316
+ for i in range(n_downsampling):
317
+ mult = 2 ** (n_downsampling - i)
318
+ model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features)
319
+ model += [nn.ReflectionPad2d(3),
320
+ nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
321
+ if add_out_act:
322
+ model.append(get_activation('tanh' if add_out_act is True else add_out_act))
323
+ self.model = nn.Sequential(*model)
324
+
325
+ def forward(self, input):
326
+ return self.model(input)
327
+
328
+
329
+ def make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs):
330
+ blocks = []
331
+ for i in range(dilated_blocks_n):
332
+ if dilation_block_kind == 'simple':
333
+ blocks.append(ResnetBlock(**dilated_block_kwargs, dilation=2 ** (i + 1)))
334
+ elif dilation_block_kind == 'multi':
335
+ blocks.append(MultidilatedResnetBlock(**dilated_block_kwargs))
336
+ else:
337
+ raise ValueError(f'Unexpected dilation_block_kind "{dilation_block_kind}"')
338
+ return blocks
339
+
340
+
341
+ class GlobalGenerator(nn.Module):
342
+ def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
343
+ padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
344
+ up_norm_layer=nn.BatchNorm2d, affine=None,
345
+ up_activation=nn.ReLU(True), dilated_blocks_n=0, dilated_blocks_n_start=0,
346
+ dilated_blocks_n_middle=0,
347
+ add_out_act=True,
348
+ max_features=1024, is_resblock_depthwise=False,
349
+ ffc_positions=None, ffc_kwargs={}, dilation=1, second_dilation=None,
350
+ dilation_block_kind='simple', multidilation_kwargs={}):
351
+ assert (n_blocks >= 0)
352
+ super().__init__()
353
+
354
+ conv_layer = get_conv_block_ctor(conv_kind)
355
+ norm_layer = get_norm_layer(norm_layer)
356
+ if affine is not None:
357
+ norm_layer = partial(norm_layer, affine=affine)
358
+ up_norm_layer = get_norm_layer(up_norm_layer)
359
+ if affine is not None:
360
+ up_norm_layer = partial(up_norm_layer, affine=affine)
361
+
362
+ if ffc_positions is not None:
363
+ ffc_positions = collections.Counter(ffc_positions)
364
+
365
+ model = [nn.ReflectionPad2d(3),
366
+ conv_layer(input_nc, ngf, kernel_size=7, padding=0),
367
+ norm_layer(ngf),
368
+ activation]
369
+
370
+ identity = Identity()
371
+ ### downsample
372
+ for i in range(n_downsampling):
373
+ mult = 2 ** i
374
+
375
+ model += [conv_layer(min(max_features, ngf * mult),
376
+ min(max_features, ngf * mult * 2),
377
+ kernel_size=3, stride=2, padding=1),
378
+ norm_layer(min(max_features, ngf * mult * 2)),
379
+ activation]
380
+
381
+ mult = 2 ** n_downsampling
382
+ feats_num_bottleneck = min(max_features, ngf * mult)
383
+
384
+ dilated_block_kwargs = dict(dim=feats_num_bottleneck, padding_type=padding_type,
385
+ activation=activation, norm_layer=norm_layer)
386
+ if dilation_block_kind == 'simple':
387
+ dilated_block_kwargs['conv_kind'] = conv_kind
388
+ elif dilation_block_kind == 'multi':
389
+ dilated_block_kwargs['conv_layer'] = functools.partial(
390
+ get_conv_block_ctor('multidilated'), **multidilation_kwargs)
391
+
392
+ # dilated blocks at the start of the bottleneck sausage
393
+ if dilated_blocks_n_start is not None and dilated_blocks_n_start > 0:
394
+ model += make_dil_blocks(dilated_blocks_n_start, dilation_block_kind, dilated_block_kwargs)
395
+
396
+ # resnet blocks
397
+ for i in range(n_blocks):
398
+ # dilated blocks at the middle of the bottleneck sausage
399
+ if i == n_blocks // 2 and dilated_blocks_n_middle is not None and dilated_blocks_n_middle > 0:
400
+ model += make_dil_blocks(dilated_blocks_n_middle, dilation_block_kind, dilated_block_kwargs)
401
+
402
+ if ffc_positions is not None and i in ffc_positions:
403
+ for _ in range(ffc_positions[i]): # same position can occur more than once
404
+ model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU,
405
+ inline=True, **ffc_kwargs)]
406
+
407
+ if is_resblock_depthwise:
408
+ resblock_groups = feats_num_bottleneck
409
+ else:
410
+ resblock_groups = 1
411
+
412
+ model += [ResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation=activation,
413
+ norm_layer=norm_layer, conv_kind=conv_kind, groups=resblock_groups,
414
+ dilation=dilation, second_dilation=second_dilation)]
415
+
416
+
417
+ # dilated blocks at the end of the bottleneck sausage
418
+ if dilated_blocks_n is not None and dilated_blocks_n > 0:
419
+ model += make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs)
420
+
421
+ # upsample
422
+ for i in range(n_downsampling):
423
+ mult = 2 ** (n_downsampling - i)
424
+ model += [nn.ConvTranspose2d(min(max_features, ngf * mult),
425
+ min(max_features, int(ngf * mult / 2)),
426
+ kernel_size=3, stride=2, padding=1, output_padding=1),
427
+ up_norm_layer(min(max_features, int(ngf * mult / 2))),
428
+ up_activation]
429
+ model += [nn.ReflectionPad2d(3),
430
+ nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
431
+ if add_out_act:
432
+ model.append(get_activation('tanh' if add_out_act is True else add_out_act))
433
+ self.model = nn.Sequential(*model)
434
+
435
+ def forward(self, input):
436
+ return self.model(input)
437
+
438
+
439
+ class GlobalGeneratorGated(GlobalGenerator):
440
+ def __init__(self, *args, **kwargs):
441
+ real_kwargs=dict(
442
+ conv_kind='gated_bn_relu',
443
+ activation=nn.Identity(),
444
+ norm_layer=nn.Identity
445
+ )
446
+ real_kwargs.update(kwargs)
447
+ super().__init__(*args, **real_kwargs)
448
+
449
+
450
+ class GlobalGeneratorFromSuperChannels(nn.Module):
451
+ def __init__(self, input_nc, output_nc, n_downsampling, n_blocks, super_channels, norm_layer="bn", padding_type='reflect', add_out_act=True):
452
+ super().__init__()
453
+ self.n_downsampling = n_downsampling
454
+ norm_layer = get_norm_layer(norm_layer)
455
+ if type(norm_layer) == functools.partial:
456
+ use_bias = (norm_layer.func == nn.InstanceNorm2d)
457
+ else:
458
+ use_bias = (norm_layer == nn.InstanceNorm2d)
459
+
460
+ channels = self.convert_super_channels(super_channels)
461
+ self.channels = channels
462
+
463
+ model = [nn.ReflectionPad2d(3),
464
+ nn.Conv2d(input_nc, channels[0], kernel_size=7, padding=0, bias=use_bias),
465
+ norm_layer(channels[0]),
466
+ nn.ReLU(True)]
467
+
468
+ for i in range(n_downsampling): # add downsampling layers
469
+ mult = 2 ** i
470
+ model += [nn.Conv2d(channels[0+i], channels[1+i], kernel_size=3, stride=2, padding=1, bias=use_bias),
471
+ norm_layer(channels[1+i]),
472
+ nn.ReLU(True)]
473
+
474
+ mult = 2 ** n_downsampling
475
+
476
+ n_blocks1 = n_blocks // 3
477
+ n_blocks2 = n_blocks1
478
+ n_blocks3 = n_blocks - n_blocks1 - n_blocks2
479
+
480
+ for i in range(n_blocks1):
481
+ c = n_downsampling
482
+ dim = channels[c]
483
+ model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer)]
484
+
485
+ for i in range(n_blocks2):
486
+ c = n_downsampling+1
487
+ dim = channels[c]
488
+ kwargs = {}
489
+ if i == 0:
490
+ kwargs = {"in_dim": channels[c-1]}
491
+ model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)]
492
+
493
+ for i in range(n_blocks3):
494
+ c = n_downsampling+2
495
+ dim = channels[c]
496
+ kwargs = {}
497
+ if i == 0:
498
+ kwargs = {"in_dim": channels[c-1]}
499
+ model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)]
500
+
501
+ for i in range(n_downsampling): # add upsampling layers
502
+ mult = 2 ** (n_downsampling - i)
503
+ model += [nn.ConvTranspose2d(channels[n_downsampling+3+i],
504
+ channels[n_downsampling+3+i+1],
505
+ kernel_size=3, stride=2,
506
+ padding=1, output_padding=1,
507
+ bias=use_bias),
508
+ norm_layer(channels[n_downsampling+3+i+1]),
509
+ nn.ReLU(True)]
510
+ model += [nn.ReflectionPad2d(3)]
511
+ model += [nn.Conv2d(channels[2*n_downsampling+3], output_nc, kernel_size=7, padding=0)]
512
+
513
+ if add_out_act:
514
+ model.append(get_activation('tanh' if add_out_act is True else add_out_act))
515
+ self.model = nn.Sequential(*model)
516
+
517
+ def convert_super_channels(self, super_channels):
518
+ n_downsampling = self.n_downsampling
519
+ result = []
520
+ cnt = 0
521
+
522
+ if n_downsampling == 2:
523
+ N1 = 10
524
+ elif n_downsampling == 3:
525
+ N1 = 13
526
+ else:
527
+ raise NotImplementedError
528
+
529
+ for i in range(0, N1):
530
+ if i in [1,4,7,10]:
531
+ channel = super_channels[cnt] * (2 ** cnt)
532
+ config = {'channel': channel}
533
+ result.append(channel)
534
+ logging.info(f"Downsample channels {result[-1]}")
535
+ cnt += 1
536
+
537
+ for i in range(3):
538
+ for counter, j in enumerate(range(N1 + i * 3, N1 + 3 + i * 3)):
539
+ if len(super_channels) == 6:
540
+ channel = super_channels[3] * 4
541
+ else:
542
+ channel = super_channels[i + 3] * 4
543
+ config = {'channel': channel}
544
+ if counter == 0:
545
+ result.append(channel)
546
+ logging.info(f"Bottleneck channels {result[-1]}")
547
+ cnt = 2
548
+
549
+ for i in range(N1+9, N1+21):
550
+ if i in [22, 25,28]:
551
+ cnt -= 1
552
+ if len(super_channels) == 6:
553
+ channel = super_channels[5 - cnt] * (2 ** cnt)
554
+ else:
555
+ channel = super_channels[7 - cnt] * (2 ** cnt)
556
+ result.append(int(channel))
557
+ logging.info(f"Upsample channels {result[-1]}")
558
+ return result
559
+
560
+ def forward(self, input):
561
+ return self.model(input)
562
+
563
+
564
+ # Defines the PatchGAN discriminator with the specified arguments.
565
+ class NLayerDiscriminator(BaseDiscriminator):
566
+ def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d,):
567
+ super().__init__()
568
+ self.n_layers = n_layers
569
+
570
+ kw = 4
571
+ padw = int(np.ceil((kw-1.0)/2))
572
+ sequence = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
573
+ nn.LeakyReLU(0.2, True)]]
574
+
575
+ nf = ndf
576
+ for n in range(1, n_layers):
577
+ nf_prev = nf
578
+ nf = min(nf * 2, 512)
579
+
580
+ cur_model = []
581
+ cur_model += [
582
+ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw),
583
+ norm_layer(nf),
584
+ nn.LeakyReLU(0.2, True)
585
+ ]
586
+ sequence.append(cur_model)
587
+
588
+ nf_prev = nf
589
+ nf = min(nf * 2, 512)
590
+
591
+ cur_model = []
592
+ cur_model += [
593
+ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw),
594
+ norm_layer(nf),
595
+ nn.LeakyReLU(0.2, True)
596
+ ]
597
+ sequence.append(cur_model)
598
+
599
+ sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]]
600
+
601
+ for n in range(len(sequence)):
602
+ setattr(self, 'model'+str(n), nn.Sequential(*sequence[n]))
603
+
604
+ def get_all_activations(self, x):
605
+ res = [x]
606
+ for n in range(self.n_layers + 2):
607
+ model = getattr(self, 'model' + str(n))
608
+ res.append(model(res[-1]))
609
+ return res[1:]
610
+
611
+ def forward(self, x):
612
+ act = self.get_all_activations(x)
613
+ return act[-1], act[:-1]
614
+
615
+
616
+ class MultidilatedNLayerDiscriminator(BaseDiscriminator):
617
+ def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, multidilation_kwargs={}):
618
+ super().__init__()
619
+ self.n_layers = n_layers
620
+
621
+ kw = 4
622
+ padw = int(np.ceil((kw-1.0)/2))
623
+ sequence = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
624
+ nn.LeakyReLU(0.2, True)]]
625
+
626
+ nf = ndf
627
+ for n in range(1, n_layers):
628
+ nf_prev = nf
629
+ nf = min(nf * 2, 512)
630
+
631
+ cur_model = []
632
+ cur_model += [
633
+ MultidilatedConv(nf_prev, nf, kernel_size=kw, stride=2, padding=[2, 3], **multidilation_kwargs),
634
+ norm_layer(nf),
635
+ nn.LeakyReLU(0.2, True)
636
+ ]
637
+ sequence.append(cur_model)
638
+
639
+ nf_prev = nf
640
+ nf = min(nf * 2, 512)
641
+
642
+ cur_model = []
643
+ cur_model += [
644
+ nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw),
645
+ norm_layer(nf),
646
+ nn.LeakyReLU(0.2, True)
647
+ ]
648
+ sequence.append(cur_model)
649
+
650
+ sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]]
651
+
652
+ for n in range(len(sequence)):
653
+ setattr(self, 'model'+str(n), nn.Sequential(*sequence[n]))
654
+
655
+ def get_all_activations(self, x):
656
+ res = [x]
657
+ for n in range(self.n_layers + 2):
658
+ model = getattr(self, 'model' + str(n))
659
+ res.append(model(res[-1]))
660
+ return res[1:]
661
+
662
+ def forward(self, x):
663
+ act = self.get_all_activations(x)
664
+ return act[-1], act[:-1]
665
+
666
+
667
+ class NLayerDiscriminatorAsGen(NLayerDiscriminator):
668
+ def forward(self, x):
669
+ return super().forward(x)[0]
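
A minimal usage sketch for the generator and discriminator classes above (the module path, the 4-channel masked-image input, and the tanh output behaviour are assumptions, not guaranteed by this diff):

import torch
from saicinpainting.training.modules.pix2pixhd import GlobalGenerator, NLayerDiscriminator

# RGB image concatenated with a binary mask -> 4 input channels (assumed convention)
gen = GlobalGenerator(input_nc=4, output_nc=3, ngf=64, n_downsampling=3, n_blocks=9)
disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=3)

x = torch.randn(1, 4, 256, 256)
with torch.no_grad():
    fake = gen(x)              # (1, 3, 256, 256); bounded by tanh when add_out_act=True
    score, feats = disc(fake)  # patch logits plus intermediate activations for feature matching
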
inpaint/saicinpainting/training/modules/spatial_transform.py ADDED
@@ -0,0 +1,49 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from kornia.geometry.transform import rotate
5
+
6
+
7
+ class LearnableSpatialTransformWrapper(nn.Module):
8
+ def __init__(self, impl, pad_coef=0.5, angle_init_range=80, train_angle=True):
9
+ super().__init__()
10
+ self.impl = impl
11
+ self.angle = torch.rand(1) * angle_init_range
12
+ if train_angle:
13
+ self.angle = nn.Parameter(self.angle, requires_grad=True)
14
+ self.pad_coef = pad_coef
15
+
16
+ def forward(self, x):
17
+ if torch.is_tensor(x):
18
+ return self.inverse_transform(self.impl(self.transform(x)), x)
19
+ elif isinstance(x, tuple):
20
+ x_trans = tuple(self.transform(elem) for elem in x)
21
+ y_trans = self.impl(x_trans)
22
+ return tuple(self.inverse_transform(elem, orig_x) for elem, orig_x in zip(y_trans, x))
23
+ else:
24
+ raise ValueError(f'Unexpected input type {type(x)}')
25
+
26
+ def transform(self, x):
27
+ height, width = x.shape[2:]
28
+ pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
29
+ x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect')
30
+ x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded))
31
+ return x_padded_rotated
32
+
33
+ def inverse_transform(self, y_padded_rotated, orig_x):
34
+ height, width = orig_x.shape[2:]
35
+ pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
36
+
37
+ y_padded = rotate(y_padded_rotated, angle=-self.angle.to(y_padded_rotated))
38
+ y_height, y_width = y_padded.shape[2:]
39
+ y = y_padded[:, :, pad_h : y_height - pad_h, pad_w : y_width - pad_w]
40
+ return y
41
+
42
+
43
+ if __name__ == '__main__':
44
+ layer = LearnableSpatialTransformWrapper(nn.Identity())
45
+ x = torch.arange(2* 3 * 15 * 15).view(2, 3, 15, 15).float()
46
+ y = layer(x)
47
+ assert x.shape == y.shape
48
+ assert torch.allclose(x[:, :, 1:, 1:][:, :, :-1, :-1], y[:, :, 1:, 1:][:, :, :-1, :-1])
49
+ print('all ok')
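
A short usage sketch in the same spirit as the self-test above; the wrapped 3x3 convolution and the import path are illustrative assumptions:

import torch
import torch.nn as nn
from saicinpainting.training.modules.spatial_transform import LearnableSpatialTransformWrapper

# wrap any shape-preserving module so it runs on a reflect-padded, randomly rotated view
wrapped = LearnableSpatialTransformWrapper(nn.Conv2d(3, 3, kernel_size=3, padding=1))
out = wrapped(torch.randn(1, 3, 64, 64))  # same shape as the input
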
inpaint/saicinpainting/training/modules/squeeze_excitation.py ADDED
@@ -0,0 +1,20 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ class SELayer(nn.Module):
5
+ def __init__(self, channel, reduction=16):
6
+ super(SELayer, self).__init__()
7
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
8
+ self.fc = nn.Sequential(
9
+ nn.Linear(channel, channel // reduction, bias=False),
10
+ nn.ReLU(inplace=True),
11
+ nn.Linear(channel // reduction, channel, bias=False),
12
+ nn.Sigmoid()
13
+ )
14
+
15
+ def forward(self, x):
16
+ b, c, _, _ = x.size()
17
+ y = self.avg_pool(x).view(b, c)
18
+ y = self.fc(y).view(b, c, 1, 1)
19
+ res = x * y.expand_as(x)
20
+ return res
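
A hedged usage sketch for the squeeze-and-excitation layer above (import path assumed):

import torch
from saicinpainting.training.modules.squeeze_excitation import SELayer

se = SELayer(channel=64, reduction=16)
feats = torch.rand(2, 64, 32, 32)
out = se(feats)  # same shape; every channel is rescaled by a learned sigmoid gate in (0, 1)
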
inpaint/saicinpainting/training/trainers/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ import torch
2
+ from saicinpainting.training.trainers.default import DefaultInpaintingTrainingModule
3
+
4
+
5
+ def get_training_model_class(kind):
6
+ if kind == 'default':
7
+ return DefaultInpaintingTrainingModule
8
+
9
+ raise ValueError(f'Unknown trainer module {kind}')
10
+
11
+
12
+ def make_training_model(config):
13
+ kind = config.training_model.kind
14
+ kwargs = dict(config.training_model)
15
+ kwargs.pop('kind')
16
+ kwargs['use_ddp'] = config.trainer.kwargs.get('accelerator', None) == 'ddp'
17
+ cls = get_training_model_class(kind)
18
+ return cls(config, **kwargs)
19
+
20
+
21
+ def load_checkpoint(train_config, path, map_location='cuda', strict=True):
22
+ model: torch.nn.Module = make_training_model(train_config)
23
+ state = torch.load(path, map_location=map_location)
24
+ model.load_state_dict(state['state_dict'], strict=strict)
25
+ model.on_load_checkpoint(state)
26
+ return model
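
These helpers are normally driven from a training config; a minimal inference-side sketch (the OmegaConf call, the checkpoint directory layout, and strict=False are assumptions, not part of this repository):

import torch
from omegaconf import OmegaConf
from saicinpainting.training.trainers import load_checkpoint

train_config = OmegaConf.load('big-lama/config.yaml')  # hypothetical checkpoint directory
model = load_checkpoint(train_config, 'big-lama/models/best.ckpt',
                        map_location='cpu', strict=False)
model.eval()  # the returned object is a LightningModule wrapping the generator
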
inpaint/saicinpainting/training/trainers/base.py ADDED
@@ -0,0 +1,19 @@
1
+ import pytorch_lightning as ptl
2
+ from inpaint.saicinpainting.training.modules import make_generator
3
+
4
+
5
+
6
+
7
+ class BaseInpaintingTrainingModule(ptl.LightningModule):
8
+ def __init__(self, config, use_ddp, *args, predict_only=False, visualize_each_iters=100,
9
+ average_generator=False, generator_avg_beta=0.999, average_generator_start_step=30000,
10
+ average_generator_period=10, store_discr_outputs_for_vis=False,
11
+ **kwargs):
12
+ super().__init__(*args, **kwargs)
13
+
14
+ self.config = config
15
+ self.generator = make_generator(config, **self.config.generator)
16
+ self.use_ddp = use_ddp
17
+ self.visualize_each_iters = visualize_each_iters
18
+
19
+
inpaint/saicinpainting/training/trainers/default.py ADDED
@@ -0,0 +1,53 @@
1
+
2
+ import torch
3
+
4
+ from saicinpainting.training.trainers.base import BaseInpaintingTrainingModule
5
+
6
+
7
+
8
+
9
+ class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule):
10
+ def __init__(self, *args, concat_mask=True, rescale_scheduler_kwargs=None, image_to_discriminator='predicted_image',
11
+ add_noise_kwargs=None, noise_fill_hole=False, const_area_crop_kwargs=None,
12
+ distance_weighter_kwargs=None, distance_weighted_mask_for_discr=False,
13
+ fake_fakes_proba=0, fake_fakes_generator_kwargs=None,
14
+ **kwargs):
15
+ super().__init__(*args, **kwargs)
16
+ self.concat_mask = concat_mask
17
+ self.image_to_discriminator = image_to_discriminator
18
+ self.add_noise_kwargs = add_noise_kwargs
19
+ self.noise_fill_hole = noise_fill_hole
20
+ self.const_area_crop_kwargs = const_area_crop_kwargs
21
+ # print(distance_weighter_kwargs)
22
+ self.refine_mask_for_losses = None
23
+ self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr
24
+
25
+ self.fake_fakes_proba = fake_fakes_proba
26
+
27
+ def forward(self, batch):
28
+
29
+ img = batch['image']
30
+ mask = batch['mask']
31
+
32
+ masked_img = img * (1 - mask)
33
+ if self.concat_mask:
34
+ masked_img = torch.cat([masked_img, mask], dim=1)
35
+
36
+ batch['predicted_image'] = self.generator(masked_img)
37
+ batch['inpainted'] = mask * batch['predicted_image'] + (1 - mask) * batch['image']
38
+ if self.fake_fakes_proba > 1e-3:
39
+ if self.training and torch.rand(1).item() < self.fake_fakes_proba:
40
+ batch['fake_fakes'], batch['fake_fakes_masks'] = self.fake_fakes_gen(img, mask)
41
+ batch['use_fake_fakes'] = True
42
+ else:
43
+ batch['fake_fakes'] = torch.zeros_like(img)
44
+ batch['fake_fakes_masks'] = torch.zeros_like(mask)
45
+ batch['use_fake_fakes'] = False
46
+
47
+ batch['mask_for_losses'] = self.refine_mask_for_losses(img, batch['predicted_image'], mask) \
48
+ if self.refine_mask_for_losses is not None and self.training \
49
+ else mask
50
+
51
+ return batch
52
+
53
+
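
The forward pass above only needs an image/mask pair; a sketch of the expected batch layout (key names follow the code above, shapes and the [0, 1] value range are assumptions):

import torch

batch = {
    'image': torch.rand(1, 3, 256, 256),                 # assumed to be in [0, 1]
    'mask': (torch.rand(1, 1, 256, 256) > 0.8).float(),  # 1 marks the hole to fill
}
with torch.no_grad():
    out = module(batch)    # `module` as returned by load_checkpoint(...) above
result = out['inpainted']  # original pixels outside the mask, generated pixels inside
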
sod/PGNet.py ADDED
@@ -0,0 +1,270 @@
1
+
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from .Res import resnet18
6
+ from .Swin import Swintransformer
7
+ Act = nn.ReLU
8
+
9
+
10
+ def weight_init(module):
11
+ for n, m in module.named_children():
12
+ if isinstance(m, nn.Conv2d):
13
+ nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
14
+ if m.bias is not None:
15
+ nn.init.zeros_(m.bias)
16
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d,nn.BatchNorm1d)):
17
+ nn.init.ones_(m.weight)
18
+ if m.bias is not None:
19
+ nn.init.zeros_(m.bias)
20
+ elif isinstance(m, nn.Linear):
21
+ nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
22
+ if m.bias is not None:
23
+ nn.init.zeros_(m.bias)
24
+ elif isinstance(m, nn.Sequential):
25
+ weight_init(m)
26
+ elif isinstance(m, nn.LayerNorm):
27
+ nn.init.constant_(m.bias, 0)
28
+ nn.init.constant_(m.weight, 1.0)
29
+ elif isinstance(m, (nn.ReLU,Act,nn.AdaptiveAvgPool2d,nn.Softmax)):
30
+ pass
31
+ else:
32
+ m.initialize()
33
+
34
+
35
+ class Grafting(nn.Module):
36
+ def __init__(self, dim, num_heads=8, qkv_bias=True, qk_scale=None):
37
+ super().__init__()
38
+ self.num_heads = num_heads
39
+ head_dim = dim // num_heads
40
+ self.scale = qk_scale or head_dim ** -0.5
41
+ self.k = nn.Linear(dim, dim , bias=qkv_bias)
42
+ self.qv = nn.Linear(dim, dim * 2, bias=qkv_bias)
43
+ self.proj = nn.Linear(dim, dim)
44
+ self.act = nn.ReLU(inplace=True)
45
+ self.conv = nn.Conv2d(8,8,kernel_size=3, stride=1, padding=1)
46
+ self.lnx = nn.LayerNorm(64)
47
+ self.lny = nn.LayerNorm(64)
48
+ self.bn = nn.BatchNorm2d(8)
49
+ self.conv2 = nn.Sequential(
50
+ nn.Conv2d(64,64,kernel_size=3, stride=1, padding=1),
51
+ nn.BatchNorm2d(64),
52
+ nn.ReLU(inplace=True),
53
+ nn.Conv2d(64,64,kernel_size=3, stride=1, padding=1),
54
+ nn.BatchNorm2d(64),
55
+ nn.ReLU(inplace=True)
56
+ )
57
+ def forward(self, x, y):
58
+ batch_size = x.shape[0]
59
+ chanel = x.shape[1]
60
+ sc = x
61
+ x = x.view(batch_size, chanel, -1).permute(0, 2, 1)
62
+ sc1 = x
63
+ x = self.lnx(x)
64
+ y = y.view(batch_size, chanel, -1).permute(0, 2, 1)
65
+ y = self.lny(y)
66
+
67
+ B, N, C = x.shape
68
+ y_k = self.k(y).reshape(B, N, 1, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
69
+ x_qv= self.qv(x).reshape(B,N,2,self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
70
+ x_q, x_v = x_qv[0], x_qv[1]
71
+ y_k = y_k[0]
72
+ attn = (x_q @ y_k.transpose(-2, -1)) * self.scale
73
+ attn = attn.softmax(dim=-1)
74
+ x = (attn @ x_v).transpose(1, 2).reshape(B, N, C)
75
+
76
+ x = self.proj(x)
77
+ x = (x+sc1)
78
+
79
+ x = x.permute(0,2,1)
80
+ x = x.view(batch_size,chanel,*sc.size()[2:])
81
+ x = self.conv2(x)+x
82
+ return x,self.act(self.bn(self.conv(attn+attn.transpose(-1,-2))))
83
+
84
+
85
+ def initialize(self):
86
+ weight_init(self)
87
+
88
+ class DB1(nn.Module):
89
+ def __init__(self,inplanes,outplanes):
90
+ super(DB1,self).__init__()
91
+ self.squeeze1 = nn.Sequential(
92
+ nn.Conv2d(inplanes, outplanes,kernel_size=1,stride=1,padding=0),
93
+ nn.BatchNorm2d(64),
94
+ nn.ReLU(inplace=True)
95
+ )
96
+ self.squeeze2 = nn.Sequential(
97
+ nn.Conv2d(64, 64, kernel_size=3,stride=1,dilation=2,padding=2),
98
+ nn.BatchNorm2d(64),
99
+ nn.ReLU(inplace=True)
100
+ )
101
+
102
+ def forward(self, x):
103
+ z = self.squeeze2(self.squeeze1(x))
104
+ return z,z
105
+
106
+ def initialize(self):
107
+ weight_init(self)
108
+
109
+ class DB2(nn.Module):
110
+ def __init__(self,inplanes,outplanes):
111
+ super(DB2,self).__init__()
112
+ self.short_cut = nn.Conv2d(outplanes, outplanes, kernel_size=1, stride=1, padding=0)
113
+ self.conv = nn.Sequential(
114
+ nn.Conv2d(inplanes+outplanes,outplanes,kernel_size=3, stride=1, padding=1),
115
+ nn.BatchNorm2d(outplanes),
116
+ nn.ReLU(inplace=True),
117
+ nn.Conv2d(outplanes,outplanes,kernel_size=3, stride=1, padding=1),
118
+ nn.BatchNorm2d(outplanes),
119
+ nn.ReLU(inplace=True)
120
+ )
121
+ self.conv2 = nn.Sequential(
122
+ nn.Conv2d(outplanes,outplanes,kernel_size=3, stride=1, padding=1),
123
+ nn.BatchNorm2d(outplanes),
124
+ nn.ReLU(inplace=True),
125
+ nn.Conv2d(outplanes,outplanes,kernel_size=3, stride=1, padding=1),
126
+ nn.BatchNorm2d(outplanes),
127
+ nn.ReLU(inplace=True)
128
+ )
129
+
130
+ def forward(self,x,z):
131
+ z = F.interpolate(z,size=x.size()[2:],mode='bilinear',align_corners=True)
132
+ p = self.conv(torch.cat((x,z),1))
133
+ sc = self.short_cut(z)
134
+ p = p+sc
135
+ p2 = self.conv2(p)
136
+ p = p+p2
137
+ return p,p
138
+
139
+ def initialize(self):
140
+ weight_init(self)
141
+
142
+ class DB3(nn.Module):
143
+ def __init__(self) -> None:
144
+ super(DB3,self).__init__()
145
+
146
+ self.db2 = DB2(64,64)
147
+
148
+ self.conv3x3 = nn.Sequential(
149
+ nn.Conv2d(64,64,kernel_size=3, stride=1, padding=1),
150
+ nn.BatchNorm2d(64),
151
+ nn.ReLU(inplace=True)
152
+ )
153
+ self.sqz_r4 = nn.Sequential(
154
+ nn.Conv2d(256, 64, kernel_size=3,stride=1,dilation=1,padding=1),
155
+ nn.BatchNorm2d(64),
156
+ nn.ReLU(inplace=True)
157
+ )
158
+
159
+ self.sqz_s1=nn.Sequential(
160
+ nn.Conv2d(128, 64, kernel_size=3,stride=1,dilation=1,padding=1),
161
+ nn.BatchNorm2d(64),
162
+ nn.ReLU(inplace=True)
163
+ )
164
+ def forward(self,s,r,up):
165
+ up = F.interpolate(up,size=s.size()[2:],mode='bilinear',align_corners=True)
166
+ s = self.sqz_s1(s)
167
+ r = self.sqz_r4(r)
168
+ sr = self.conv3x3(s+r)
169
+ out,_ =self.db2(sr,up)
170
+ return out,out
171
+ def initialize(self):
172
+ weight_init(self)
173
+
174
+
175
+
176
+ class decoder(nn.Module):
177
+ def __init__(self) -> None:
178
+ super(decoder,self).__init__()
179
+ self.sqz_s2=nn.Sequential(
180
+ nn.Conv2d(256, 64, kernel_size=3,stride=1,dilation=1,padding=1),
181
+ nn.BatchNorm2d(64),
182
+ nn.ReLU(inplace=True)
183
+ )
184
+ self.sqz_r5 = nn.Sequential(
185
+ nn.Conv2d(512, 64, kernel_size=3,stride=1,dilation=1,padding=1),
186
+ nn.BatchNorm2d(64),
187
+ nn.ReLU(inplace=True)
188
+ )
189
+
190
+ self.GF = Grafting(64,num_heads=8)
191
+ self.d1 = DB1(512,64)
192
+ self.d2 = DB2(512,64)
193
+ self.d3 = DB2(64,64)
194
+ self.d4 = DB3()
195
+ self.d5 = DB2(128,64)
196
+ self.d6 = DB2(64,64)
197
+
198
+ def forward(self,s1,s2,s3,s4,r2,r3,r4,r5):
199
+ r5 = F.interpolate(r5,size = s2.size()[2:],mode='bilinear',align_corners=True)
200
+ s1 = F.interpolate(s1,size = r4.size()[2:],mode='bilinear',align_corners=True)
201
+
202
+ s4_,_ = self.d1(s4)
203
+ s3_,_ = self.d2(s3,s4_)
204
+
205
+ s2_ = self.sqz_s2(s2)
206
+ r5_= self.sqz_r5(r5)
207
+ graft_feature_r5,cam = self.GF(r5_,s2_)
208
+
209
+ graft_feature_r5_,_=self.d3(graft_feature_r5,s3_)
210
+
211
+ graft_feature_r4,_=self.d4(s1,r4,graft_feature_r5_)
212
+
213
+ r3_,_ = self.d5(r3,graft_feature_r4)
214
+
215
+ r2_,_ = self.d6(r2,r3_)
216
+
217
+ return r2_,cam,r5_,s2_
218
+
219
+ def initialize(self):
220
+ weight_init(self)
221
+
222
+
223
+
224
+
225
+ class PGNet(nn.Module):
226
+ def __init__(self, cfg=None):
227
+ super(PGNet, self).__init__()
228
+ self.cfg = cfg
229
+ self.decoder = decoder()
230
+ self.linear1 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
231
+ self.linear2 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
232
+ self.linear3 = nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)
233
+ self.conv = nn.Conv2d(8,1,kernel_size=3, stride=1, padding=1)
234
+
235
+
236
+ if self.cfg is None or self.cfg.snapshot is None:
237
+ weight_init(self)
238
+
239
+ self.resnet = resnet18()
240
+ self.swin = Swintransformer(224)
241
+ device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
242
+ self.swin.load_state_dict(torch.load('sod/weights/swin224.pth', map_location=device)['model'],strict=False)
243
+ self.resnet.load_state_dict(torch.load('sod/weights/resnet18.pth', map_location=device),strict=False)
244
+
245
+ if self.cfg is not None and self.cfg.snapshot:
246
+ print('load checkpoint')
247
+ pretrain=torch.load(self.cfg.snapshot, map_location=device)
248
+ new_state_dict = {}
249
+ for k,v in pretrain.items():
250
+ new_state_dict[k[7:]] = v
251
+ self.load_state_dict(new_state_dict, strict=False)
252
+
253
+ def forward(self, x,shape=None,mask=None):
254
+ shape = x.size()[2:] if shape is None else shape
255
+ y = F.interpolate(x, size=(224,224), mode='bilinear',align_corners=True)
256
+
257
+ r2,r3,r4,r5 = self.resnet(x)
258
+ s1,s2,s3,s4 = self.swin(y)
259
+ r2_,attmap,r5_,s2_ = self.decoder(s1,s2,s3,s4,r2,r3,r4,r5)
260
+
261
+ pred1 = F.interpolate(self.linear1(r2_), size=shape, mode='bilinear')
262
+ wr = F.interpolate(self.linear2(r5_), size=(28,28), mode='bilinear')
263
+ ws = F.interpolate(self.linear3(s2_), size=(28,28), mode='bilinear')
264
+
265
+
266
+ return pred1,wr,ws,self.conv(attmap)
267
+
268
+
269
+
270
+
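
A usage sketch for the saliency network defined above; the import path and the 352x352 input are assumptions, and the constructor expects the weight files under sod/weights/ to exist:

import torch
from sod.PGNet import PGNet

net = PGNet().eval()              # cfg=None: backbone weights are loaded from sod/weights/
img = torch.rand(1, 3, 352, 352)  # arbitrary size; the Swin branch is resized to 224x224 internally
with torch.no_grad():
    pred, wr, ws, att = net(img)
saliency = torch.sigmoid(pred)[0, 0]  # logits -> probability map at the input resolution
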
sod/Res.py ADDED
@@ -0,0 +1,363 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
6
+ 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
7
+ 'wide_resnet50_2', 'wide_resnet101_2']
8
+
9
+
10
+ model_urls = {
11
+ 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
12
+ 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
13
+ 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
14
+ 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
15
+ 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
16
+ 'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
17
+ 'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
18
+ 'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
19
+ 'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
20
+ }
21
+
22
+
23
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
24
+ """3x3 convolution with padding"""
25
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
26
+ padding=dilation, groups=groups, bias=False, dilation=dilation)
27
+
28
+
29
+ def conv1x1(in_planes, out_planes, stride=1):
30
+ """1x1 convolution"""
31
+ return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
32
+
33
+
34
+ class BasicBlock(nn.Module):
35
+ expansion = 1
36
+
37
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
38
+ base_width=64, dilation=1, norm_layer=None):
39
+ super(BasicBlock, self).__init__()
40
+ if norm_layer is None:
41
+ norm_layer = nn.BatchNorm2d
42
+ if groups != 1 or base_width != 64:
43
+ raise ValueError('BasicBlock only supports groups=1 and base_width=64')
44
+ if dilation > 1:
45
+ raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
46
+ # Both self.conv1 and self.downsample layers downsample the input when stride != 1
47
+ self.conv1 = conv3x3(inplanes, planes, stride)
48
+ self.bn1 = norm_layer(planes)
49
+ self.relu = nn.ReLU(inplace=True)
50
+ self.conv2 = conv3x3(planes, planes)
51
+ self.bn2 = norm_layer(planes)
52
+ self.downsample = downsample
53
+ self.stride = stride
54
+
55
+ def forward(self, x):
56
+ identity = x
57
+
58
+ out = self.conv1(x)
59
+ out = self.bn1(out)
60
+ out = self.relu(out)
61
+
62
+ out = self.conv2(out)
63
+ out = self.bn2(out)
64
+
65
+ if self.downsample is not None:
66
+ identity = self.downsample(x)
67
+
68
+ out += identity
69
+ out = self.relu(out)
70
+
71
+ return out
72
+ def initialize(self):
73
+ weight_init(self)
74
+
75
+
76
+ class Bottleneck(nn.Module):
77
+ # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
78
+ # while original implementation places the stride at the first 1x1 convolution(self.conv1)
79
+ # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
80
+ # This variant is also known as ResNet V1.5 and improves accuracy according to
81
+ # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
82
+
83
+ expansion = 4
84
+
85
+ def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
86
+ base_width=64, dilation=1, norm_layer=None):
87
+ super(Bottleneck, self).__init__()
88
+ if norm_layer is None:
89
+ norm_layer = nn.BatchNorm2d
90
+ width = int(planes * (base_width / 64.)) * groups
91
+ # Both self.conv2 and self.downsample layers downsample the input when stride != 1
92
+ self.conv1 = conv1x1(inplanes, width)
93
+ self.bn1 = norm_layer(width)
94
+ self.conv2 = conv3x3(width, width, stride, groups, dilation)
95
+ self.bn2 = norm_layer(width)
96
+ self.conv3 = conv1x1(width, planes * self.expansion)
97
+ self.bn3 = norm_layer(planes * self.expansion)
98
+ self.relu = nn.ReLU(inplace=True)
99
+ self.downsample = downsample
100
+ self.stride = stride
101
+
102
+ def forward(self, x):
103
+ identity = x
104
+
105
+ out = self.conv1(x)
106
+ out = self.bn1(out)
107
+ out = self.relu(out)
108
+
109
+ out = self.conv2(out)
110
+ out = self.bn2(out)
111
+ out = self.relu(out)
112
+
113
+ out = self.conv3(out)
114
+ out = self.bn3(out)
115
+
116
+ if self.downsample is not None:
117
+ identity = self.downsample(x)
118
+
119
+ out += identity
120
+ out = self.relu(out)
121
+
122
+ return out
123
+ def initialize(self):
124
+ weight_init(self)
125
+
126
+ def weight_init(module):
127
+ for n, m in module.named_children():
128
+ # print('initialize: '+n)
129
+ if isinstance(m, nn.Conv2d):
130
+ nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
131
+ if m.bias is not None:
132
+ nn.init.zeros_(m.bias)
133
+ elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d)):
134
+ nn.init.ones_(m.weight)
135
+ if m.bias is not None:
136
+ nn.init.zeros_(m.bias)
137
+ elif isinstance(m, nn.Linear):
138
+ nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
139
+ if m.bias is not None:
140
+ nn.init.zeros_(m.bias)
141
+ elif isinstance(m, nn.Sequential):
142
+ weight_init(m)
143
+ elif isinstance(m, (nn.ReLU,nn.AdaptiveAvgPool2d,nn.Softmax,nn.MaxPool2d)):
144
+ pass
145
+ else:
146
+ m.initialize()
147
+
148
+
149
+ class ResNet(nn.Module):
150
+
151
+ def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
152
+ groups=1, width_per_group=64, replace_stride_with_dilation=None,
153
+ norm_layer=None):
154
+ super(ResNet, self).__init__()
155
+ if norm_layer is None:
156
+ norm_layer = nn.BatchNorm2d
157
+ self._norm_layer = norm_layer
158
+
159
+ self.inplanes = 64
160
+ self.dilation = 1
161
+ if replace_stride_with_dilation is None:
162
+ # each element in the tuple indicates if we should replace
163
+ # the 2x2 stride with a dilated convolution instead
164
+ replace_stride_with_dilation = [False, False, False]
165
+ if len(replace_stride_with_dilation) != 3:
166
+ raise ValueError("replace_stride_with_dilation should be None "
167
+ "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
168
+ self.groups = groups
169
+ self.base_width = width_per_group
170
+ self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
171
+ bias=False)
172
+ self.bn1 = norm_layer(self.inplanes)
173
+ self.relu = nn.ReLU(inplace=True)
174
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
175
+ self.layer1 = self._make_layer(block, 64, layers[0])
176
+ self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
177
+ dilate=replace_stride_with_dilation[0])
178
+ self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
179
+ dilate=replace_stride_with_dilation[1])
180
+ self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
181
+ dilate=replace_stride_with_dilation[2])
182
+
183
+ for m in self.modules():
184
+ if isinstance(m, nn.Conv2d):
185
+ nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
186
+ elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
187
+ nn.init.constant_(m.weight, 1)
188
+ nn.init.constant_(m.bias, 0)
189
+
190
+ # Zero-initialize the last BN in each residual branch,
191
+ # so that the residual branch starts with zeros, and each residual block behaves like an identity.
192
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
193
+ if zero_init_residual:
194
+ for m in self.modules():
195
+ if isinstance(m, Bottleneck):
196
+ nn.init.constant_(m.bn3.weight, 0)
197
+ elif isinstance(m, BasicBlock):
198
+ nn.init.constant_(m.bn2.weight, 0)
199
+
200
+ def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
201
+ norm_layer = self._norm_layer
202
+ downsample = None
203
+ previous_dilation = self.dilation
204
+ if dilate:
205
+ self.dilation *= stride
206
+ stride = 1
207
+ if stride != 1 or self.inplanes != planes * block.expansion:
208
+ downsample = nn.Sequential(
209
+ conv1x1(self.inplanes, planes * block.expansion, stride),
210
+ norm_layer(planes * block.expansion),
211
+ )
212
+
213
+ layers = []
214
+ layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
215
+ self.base_width, previous_dilation, norm_layer))
216
+ self.inplanes = planes * block.expansion
217
+ for _ in range(1, blocks):
218
+ layers.append(block(self.inplanes, planes, groups=self.groups,
219
+ base_width=self.base_width, dilation=self.dilation,
220
+ norm_layer=norm_layer))
221
+
222
+ return nn.Sequential(*layers)
223
+
224
+ def forward(self, x):
225
+ out1 = F.relu(self.bn1(self.conv1(x)),inplace=True)
226
+ out1 = self.maxpool(out1)
227
+ out2 = self.layer1(out1)
228
+ out3 = self.layer2(out2)
229
+ out4 = self.layer3(out3)
230
+ out5 = self.layer4(out4)
231
+ return out2, out3, out4, out5
232
+ def initialize(self):
233
+ weight_init(self)
234
+
235
+
236
+ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
237
+ model = ResNet(block, layers, **kwargs)
238
+
239
+ return model
240
+
241
+
242
+ def resnet18(pretrained=False, progress=True, **kwargs):
243
+ r"""ResNet-18 model from
244
+ `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
245
+
246
+ Args:
247
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
248
+ progress (bool): If True, displays a progress bar of the download to stderr
249
+ """
250
+ return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
251
+ **kwargs)
252
+
253
+
254
+ def resnet34(pretrained=False, progress=True, **kwargs):
255
+ r"""ResNet-34 model from
256
+ `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
257
+
258
+ Args:
259
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
260
+ progress (bool): If True, displays a progress bar of the download to stderr
261
+ """
262
+ return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
263
+ **kwargs)
264
+
265
+
266
+ def resnet50(pretrained=False, progress=True, **kwargs):
267
+ r"""ResNet-50 model from
268
+ `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
269
+
270
+ Args:
271
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
272
+ progress (bool): If True, displays a progress bar of the download to stderr
273
+ """
274
+ return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
275
+ **kwargs)
276
+
277
+
278
+ def resnet101(pretrained=False, progress=True, **kwargs):
279
+ r"""ResNet-101 model from
280
+ `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
281
+
282
+ Args:
283
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
284
+ progress (bool): If True, displays a progress bar of the download to stderr
285
+ """
286
+ return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
287
+ **kwargs)
288
+
289
+
290
+ def resnet152(pretrained=False, progress=True, **kwargs):
291
+ r"""ResNet-152 model from
292
+ `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
293
+
294
+ Args:
295
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
296
+ progress (bool): If True, displays a progress bar of the download to stderr
297
+ """
298
+ return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
299
+ **kwargs)
300
+
301
+
302
+ def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
303
+ r"""ResNeXt-50 32x4d model from
304
+ `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
305
+
306
+ Args:
307
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
308
+ progress (bool): If True, displays a progress bar of the download to stderr
309
+ """
310
+ kwargs['groups'] = 32
311
+ kwargs['width_per_group'] = 4
312
+ return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
313
+ pretrained, progress, **kwargs)
314
+
315
+
316
+ def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
317
+ r"""ResNeXt-101 32x8d model from
318
+ `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
319
+
320
+ Args:
321
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
322
+ progress (bool): If True, displays a progress bar of the download to stderr
323
+ """
324
+ kwargs['groups'] = 32
325
+ kwargs['width_per_group'] = 8
326
+ return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
327
+ pretrained, progress, **kwargs)
328
+
329
+
330
+ def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
331
+ r"""Wide ResNet-50-2 model from
332
+ `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
333
+
334
+ The model is the same as ResNet except for the bottleneck number of channels
335
+ which is twice larger in every block. The number of channels in outer 1x1
336
+ convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
337
+ channels, and in Wide ResNet-50-2 has 2048-1024-2048.
338
+
339
+ Args:
340
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
341
+ progress (bool): If True, displays a progress bar of the download to stderr
342
+ """
343
+ kwargs['width_per_group'] = 64 * 2
344
+ return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
345
+ pretrained, progress, **kwargs)
346
+
347
+
348
+ def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
349
+ r"""Wide ResNet-101-2 model from
350
+ `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
351
+
352
+ The model is the same as ResNet except for the bottleneck number of channels
353
+ which is twice larger in every block. The number of channels in outer 1x1
354
+ convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
355
+ channels, and in Wide ResNet-50-2 has 2048-1024-2048.
356
+
357
+ Args:
358
+ pretrained (bool): If True, returns a model pre-trained on ImageNet
359
+ progress (bool): If True, displays a progress bar of the download to stderr
360
+ """
361
+ kwargs['width_per_group'] = 64 * 2
362
+ return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
363
+ pretrained, progress, **kwargs)
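
Unlike torchvision's ResNet, the variant above drops the classification head and returns the four stage feature maps; a small sketch (import path assumed):

import torch
from sod.Res import resnet18

backbone = resnet18()
feats = backbone(torch.rand(1, 3, 224, 224))
for f in feats:
    print(tuple(f.shape))  # (1, 64, 56, 56), (1, 128, 28, 28), (1, 256, 14, 14), (1, 512, 7, 7)
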
sod/Swin.py ADDED
@@ -0,0 +1,578 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.utils.checkpoint as checkpoint
4
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
5
+
6
+ class Mlp(nn.Module):
7
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
8
+ super().__init__()
9
+ out_features = out_features or in_features
10
+ hidden_features = hidden_features or in_features
11
+ self.fc1 = nn.Linear(in_features, hidden_features)
12
+ self.act = act_layer()
13
+ self.fc2 = nn.Linear(hidden_features, out_features)
14
+ self.drop = nn.Dropout(drop)
15
+
16
+ def forward(self, x):
17
+ x = self.fc1(x)
18
+ x = self.act(x)
19
+ x = self.drop(x)
20
+ x = self.fc2(x)
21
+ x = self.drop(x)
22
+ return x
23
+
24
+
25
+ def window_partition(x, window_size):
26
+ """
27
+ Args:
28
+ x: (B, H, W, C)
29
+ window_size (int): window size
30
+
31
+ Returns:
32
+ windows: (num_windows*B, window_size, window_size, C)
33
+ """
34
+ B, H, W, C = x.shape
35
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
36
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
37
+ return windows
38
+
39
+
40
+ def window_reverse(windows, window_size, H, W):
41
+ """
42
+ Args:
43
+ windows: (num_windows*B, window_size, window_size, C)
44
+ window_size (int): Window size
45
+ H (int): Height of image
46
+ W (int): Width of image
47
+
48
+ Returns:
49
+ x: (B, H, W, C)
50
+ """
51
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
52
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
53
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
54
+ return x
55
+
56
+
57
+ class WindowAttention(nn.Module):
58
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
59
+ It supports both of shifted and non-shifted window.
60
+
61
+ Args:
62
+ dim (int): Number of input channels.
63
+ window_size (tuple[int]): The height and width of the window.
64
+ num_heads (int): Number of attention heads.
65
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
66
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
67
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
68
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
69
+ """
70
+
71
+ def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):
72
+
73
+ super().__init__()
74
+ self.dim = dim
75
+ self.window_size = window_size # Wh, Ww
76
+ self.num_heads = num_heads
77
+ head_dim = dim // num_heads
78
+ self.scale = qk_scale or head_dim ** -0.5
79
+
80
+ # define a parameter table of relative position bias
81
+ self.relative_position_bias_table = nn.Parameter(
82
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
83
+
84
+ # get pair-wise relative position index for each token inside the window
85
+ coords_h = torch.arange(self.window_size[0])
86
+ coords_w = torch.arange(self.window_size[1])
87
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
88
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
89
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
90
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
91
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
92
+ relative_coords[:, :, 1] += self.window_size[1] - 1
93
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
94
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
95
+ self.register_buffer("relative_position_index", relative_position_index)
96
+
97
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
98
+ self.attn_drop = nn.Dropout(attn_drop)
99
+ self.proj = nn.Linear(dim, dim)
100
+ self.proj_drop = nn.Dropout(proj_drop)
101
+
102
+ trunc_normal_(self.relative_position_bias_table, std=.02)
103
+ self.softmax = nn.Softmax(dim=-1)
104
+
105
+ def forward(self, x, mask=None):
106
+ """
107
+ Args:
108
+ x: input features with shape of (num_windows*B, N, C)
109
+ mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
110
+ """
111
+ B_, N, C = x.shape
112
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
113
+ q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
114
+
115
+ q = q * self.scale
116
+ attn = (q @ k.transpose(-2, -1))
117
+
118
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
119
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
120
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
121
+ attn = attn + relative_position_bias.unsqueeze(0)
122
+
123
+ if mask is not None:
124
+ nW = mask.shape[0]
125
+ attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
126
+ attn = attn.view(-1, self.num_heads, N, N)
127
+ attn = self.softmax(attn)
128
+ else:
129
+ attn = self.softmax(attn)
130
+
131
+ attn = self.attn_drop(attn)
132
+
133
+ x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
134
+ x = self.proj(x)
135
+ x = self.proj_drop(x)
136
+ return x
137
+
138
+ def extra_repr(self) -> str:
139
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
140
+
141
+ def flops(self, N):
142
+ # calculate flops for 1 window with token length of N
143
+ flops = 0
144
+ # qkv = self.qkv(x)
145
+ flops += N * self.dim * 3 * self.dim
146
+ # attn = (q @ k.transpose(-2, -1))
147
+ flops += self.num_heads * N * (self.dim // self.num_heads) * N
148
+ # x = (attn @ v)
149
+ flops += self.num_heads * N * N * (self.dim // self.num_heads)
150
+ # x = self.proj(x)
151
+ flops += N * self.dim * self.dim
152
+ return flops
153
+
154
+
155
+ class SwinTransformerBlock(nn.Module):
156
+ r""" Swin Transformer Block.
157
+
158
+ Args:
159
+ dim (int): Number of input channels.
160
+ input_resolution (tuple[int]): Input resolution.
161
+ num_heads (int): Number of attention heads.
162
+ window_size (int): Window size.
163
+ shift_size (int): Shift size for SW-MSA.
164
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
165
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
166
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
167
+ drop (float, optional): Dropout rate. Default: 0.0
168
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
169
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
170
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
171
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
172
+ """
173
+
174
+ def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0,
175
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
176
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm):
177
+ super().__init__()
178
+ self.dim = dim
179
+ self.input_resolution = input_resolution
180
+ self.num_heads = num_heads
181
+ self.window_size = window_size
182
+ self.shift_size = shift_size
183
+ self.mlp_ratio = mlp_ratio
184
+ if min(self.input_resolution) <= self.window_size:
185
+ # if window size is larger than input resolution, we don't partition windows
186
+ self.shift_size = 0
187
+ self.window_size = min(self.input_resolution)
188
+ assert 0 <= self.shift_size < self.window_size, "shift_size must be in [0, window_size)"
189
+
190
+ self.norm1 = norm_layer(dim)
191
+ self.attn = WindowAttention(
192
+ dim, window_size=to_2tuple(self.window_size), num_heads=num_heads,
193
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
194
+
195
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
196
+ self.norm2 = norm_layer(dim)
197
+ mlp_hidden_dim = int(dim * mlp_ratio)
198
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
199
+
200
+ if self.shift_size > 0:
201
+ # calculate attention mask for SW-MSA
202
+ H, W = self.input_resolution
203
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
204
+ h_slices = (slice(0, -self.window_size),
205
+ slice(-self.window_size, -self.shift_size),
206
+ slice(-self.shift_size, None))
207
+ w_slices = (slice(0, -self.window_size),
208
+ slice(-self.window_size, -self.shift_size),
209
+ slice(-self.shift_size, None))
210
+ cnt = 0
211
+ for h in h_slices:
212
+ for w in w_slices:
213
+ img_mask[:, h, w, :] = cnt
214
+ cnt += 1
215
+
216
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
217
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
218
+ atten_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
219
+ atten_mask = atten_mask.masked_fill(atten_mask != 0, float(-100.0)).masked_fill(atten_mask == 0, float(0.0))
220
+ else:
221
+ atten_mask = None
222
+
223
+ self.register_buffer("atten_mask", atten_mask)
224
+
225
+ def forward(self, x):
226
+ H, W = self.input_resolution
227
+ B, L, C = x.shape
228
+ assert L == H * W, "input feature has wrong size"
229
+
230
+ shortcut = x
231
+ x = self.norm1(x)
232
+ x = x.view(B, H, W, C)
233
+
234
+ # cyclic shift
235
+ if self.shift_size > 0:
236
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
237
+ else:
238
+ shifted_x = x
239
+
240
+ # partition windows
241
+ x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C
242
+ x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C
243
+
244
+ # W-MSA/SW-MSA
245
+ attn_windows = self.attn(x_windows, mask=self.atten_mask) # nW*B, window_size*window_size, C
246
+
247
+ # merge windows
248
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
249
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C
250
+
251
+ # reverse cyclic shift
252
+ if self.shift_size > 0:
253
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
254
+ else:
255
+ x = shifted_x
256
+ x = x.view(B, H * W, C)
257
+
258
+ # FFN
259
+ x = shortcut + self.drop_path(x)
260
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
261
+
262
+ return x
263
+
264
+ def extra_repr(self) -> str:
265
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
266
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
267
+
268
+ def flops(self):
269
+ flops = 0
270
+ H, W = self.input_resolution
271
+ # norm1
272
+ flops += self.dim * H * W
273
+ # W-MSA/SW-MSA
274
+ nW = H * W / self.window_size / self.window_size
275
+ flops += nW * self.attn.flops(self.window_size * self.window_size)
276
+ # mlp
277
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
278
+ # norm2
279
+ flops += self.dim * H * W
280
+ return flops
281
+
282
+
283
+ class PatchMerging(nn.Module):
+     r""" Patch Merging Layer.
+
+     Args:
+         input_resolution (tuple[int]): Resolution of input feature.
+         dim (int): Number of input channels.
+         norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+     """
+
+     def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
+         super().__init__()
+         self.input_resolution = input_resolution
+         self.dim = dim
+         self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
+         self.norm = norm_layer(4 * dim)
+
+     def forward(self, x):
+         """
+         x: B, H*W, C
+         """
+         H, W = self.input_resolution
+         B, L, C = x.shape
+         assert L == H * W, "input feature has wrong size"
+         assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
+
+         x = x.view(B, H, W, C)
+
+         x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
+         x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
+         x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
+         x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
+         x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
+         x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C
+
+         x = self.norm(x)
+         x = self.reduction(x)
+
+         return x
+
+     def extra_repr(self) -> str:
+         return f"input_resolution={self.input_resolution}, dim={self.dim}"
+
+     def flops(self):
+         H, W = self.input_resolution
+         flops = H * W * self.dim
+         flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
+         return flops
+
+
+ class BasicLayer(nn.Module):
+     """ A basic Swin Transformer layer for one stage.
+
+     Args:
+         dim (int): Number of input channels.
+         input_resolution (tuple[int]): Input resolution.
+         depth (int): Number of blocks.
+         num_heads (int): Number of attention heads.
+         window_size (int): Local window size.
+         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+         qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+         qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+         drop (float, optional): Dropout rate. Default: 0.0
+         attn_drop (float, optional): Attention dropout rate. Default: 0.0
+         drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
+         norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
+         downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
+         use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+     """
+
+     def __init__(self, dim, input_resolution, depth, num_heads, window_size,
+                  mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
+                  drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False):
+
+         super().__init__()
+         self.dim = dim
+         self.input_resolution = input_resolution
+         self.depth = depth
+         self.use_checkpoint = use_checkpoint
+
+         # build blocks
+         self.blocks = nn.ModuleList([
+             SwinTransformerBlock(dim=dim, input_resolution=input_resolution,
+                                  num_heads=num_heads, window_size=window_size,
+                                  shift_size=0 if (i % 2 == 0) else window_size // 2,
+                                  mlp_ratio=mlp_ratio,
+                                  qkv_bias=qkv_bias, qk_scale=qk_scale,
+                                  drop=drop, attn_drop=attn_drop,
+                                  drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
+                                  norm_layer=norm_layer)
+             for i in range(depth)])
+
+         # patch merging layer
+         if downsample is not None:
+             self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer)
+         else:
+             self.downsample = None
+
+     def forward(self, x):
+         for blk in self.blocks:
+             if self.use_checkpoint:
+                 x = checkpoint.checkpoint(blk, x)
+             else:
+                 x = blk(x)
+         if self.downsample is not None:
+             x = self.downsample(x)
+         return x
+
+     def extra_repr(self) -> str:
+         return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
+
+     def flops(self):
+         flops = 0
+         for blk in self.blocks:
+             flops += blk.flops()
+         if self.downsample is not None:
+             flops += self.downsample.flops()
+         return flops
+
+
+ class PatchEmbed(nn.Module):
+     r""" Image to Patch Embedding
+
+     Args:
+         img_size (int): Image size. Default: 224.
+         patch_size (int): Patch token size. Default: 4.
+         in_chans (int): Number of input image channels. Default: 3.
+         embed_dim (int): Number of linear projection output channels. Default: 96.
+         norm_layer (nn.Module, optional): Normalization layer. Default: None
+     """
+
+     def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+         super().__init__()
+         img_size = to_2tuple(img_size)
+         patch_size = to_2tuple(patch_size)
+         patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]]
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.patches_resolution = patches_resolution
+         self.num_patches = patches_resolution[0] * patches_resolution[1]
+
+         self.in_chans = in_chans
+         self.embed_dim = embed_dim
+
+         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+         if norm_layer is not None:
+             self.norm = norm_layer(embed_dim)
+         else:
+             self.norm = None
+
+     def forward(self, x):
+         B, C, H, W = x.shape
+         # FIXME look at relaxing size constraints
+         assert H == self.img_size[0] and W == self.img_size[1], \
+             f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+         x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
+         if self.norm is not None:
+             x = self.norm(x)
+         return x
+
+     def flops(self):
+         Ho, Wo = self.patches_resolution
+         flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+         if self.norm is not None:
+             flops += Ho * Wo * self.embed_dim
+         return flops
+
+
+ class Swintransformer(nn.Module):
+     r""" Swin Transformer
+         A PyTorch impl of: `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` -
+         https://arxiv.org/pdf/2103.14030
+
+     Args:
+         img_size (int | tuple(int)): Input image size. Default 224
+         patch_size (int | tuple(int)): Patch size. Default: 4
+         in_chans (int): Number of input image channels. Default: 3
+         num_classes (int): Number of classes for classification head. Default: 1000
+         embed_dim (int): Patch embedding dimension. Default: 96
+         depths (tuple(int)): Depth of each Swin Transformer layer.
+         num_heads (tuple(int)): Number of attention heads in different layers.
+         window_size (int): Window size. Default: 7
+         mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+         qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+         qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
+         drop_rate (float): Dropout rate. Default: 0
+         attn_drop_rate (float): Attention dropout rate. Default: 0
+         drop_path_rate (float): Stochastic depth rate. Default: 0.1
+         norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
+         ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+         patch_norm (bool): If True, add normalization after patch embedding. Default: True
+         use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False
+     """
+
+     def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000,
+                  embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32],
+                  window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                  drop_rate=0., attn_drop_rate=0., drop_path_rate=0.5,
+                  norm_layer=nn.LayerNorm, ape=False, patch_norm=True,
+                  use_checkpoint=False, **kwargs):
+         super().__init__()
+
+         self.num_classes = num_classes
+         self.num_layers = len(depths)
+         self.embed_dim = embed_dim
+         self.ape = ape
+         self.patch_norm = patch_norm
+         self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
+         self.mlp_ratio = mlp_ratio
+
+         # split image into non-overlapping patches
+         self.patch_embed = PatchEmbed(
+             img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
+             norm_layer=norm_layer if self.patch_norm else None)
+         num_patches = self.patch_embed.num_patches
+         patches_resolution = self.patch_embed.patches_resolution
+         self.patches_resolution = patches_resolution
+
+         # absolute position embedding
+         if self.ape:
+             self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim))
+             trunc_normal_(self.absolute_pos_embed, std=.02)
+
+         self.pos_drop = nn.Dropout(p=drop_rate)
+
+         # stochastic depth
+         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
+
+         # build layers
+         self.layers = nn.ModuleList()
+         for i_layer in range(self.num_layers - 1):
+             layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer),
+                                input_resolution=(patches_resolution[0] // (2 ** i_layer),
+                                                  patches_resolution[1] // (2 ** i_layer)),
+                                depth=depths[i_layer],
+                                num_heads=num_heads[i_layer],
+                                window_size=window_size,
+                                mlp_ratio=self.mlp_ratio,
+                                qkv_bias=qkv_bias, qk_scale=qk_scale,
+                                drop=drop_rate, attn_drop=attn_drop_rate,
+                                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                                norm_layer=norm_layer,
+                                downsample=PatchMerging if (i_layer < self.num_layers - 2) else None,
+                                use_checkpoint=use_checkpoint)
+             self.layers.append(layer)
+
+         self.norm = norm_layer(self.num_features)
+         self.avgpool = nn.AdaptiveAvgPool1d(1)
+
+         self.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     @torch.jit.ignore
+     def no_weight_decay(self):
+         return {'absolute_pos_embed'}
+
+     @torch.jit.ignore
+     def no_weight_decay_keywords(self):
+         return {'relative_position_bias_table'}
+
+     def forward_features(self, x):
+         b, c, h, w = x.shape
+         x = self.patch_embed(x)
+
+         if self.ape:
+             x = x + self.absolute_pos_embed
+         x = self.pos_drop(x)
+         s = []
+         s.append(x.view(b, int(x.shape[1] ** 0.5), int(x.shape[1] ** 0.5), -1).permute(0, 3, 1, 2).contiguous())
+         for layer in self.layers:
+             x = layer(x)
+             s.append(x.view(b, int(x.shape[1] ** 0.5), int(x.shape[1] ** 0.5), -1).permute(0, 3, 1, 2).contiguous())
+
+         # x = self.norm(x)  # B L C
+         # x = self.avgpool(x.transpose(1, 2))  # B C 1
+         # x = torch.flatten(x, 1)
+         return s
+
+     def forward(self, x):
+         x = self.forward_features(x)
+         # x = self.head(x)
+         return x
+
+     def flops(self):
+         flops = 0
+         flops += self.patch_embed.flops()
+         for i, layer in enumerate(self.layers):
+             flops += layer.flops()
+         return flops
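
A minimal usage sketch of the Swin backbone defined above (illustrative only, not part of this commit). It assumes the class is importable as sod.Swin.Swintransformer — the actual module name is not shown in this diff — and that the helpers it references (WindowAttention, Mlp, window_partition, window_reverse, DropPath, to_2tuple, trunc_normal_) are defined earlier in the same file. forward() returns the list of multi-scale feature maps collected in forward_features(), one entry for the patch embedding plus one per stage:

import torch
from sod.Swin import Swintransformer  # hypothetical import path

net = Swintransformer(img_size=224, embed_dim=128, depths=[2, 2, 18, 2],
                      num_heads=[4, 8, 16, 32], window_size=7)
net.eval()
with torch.no_grad():
    feats = net(torch.randn(1, 3, 224, 224))  # list of pyramid features, fine to coarse
for f in feats:
    print(f.shape)  # (1, 128, 56, 56), (1, 256, 28, 28), (1, 512, 14, 14), (1, 512, 14, 14) for these settings

Note that the last BasicLayer is built with downsample=None, so the final two entries share the same spatial resolution, and the classification head of the reference implementation is dropped in favour of returning the feature pyramid, presumably for the saliency decoder built on top of it.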
sod/configs/prediction/default.yaml ADDED
@@ -0,0 +1,14 @@
+ indir: no  # to be overridden in CLI
+ outdir: no  # to be overridden in CLI
+
+ model:
+   path: no  # to be overridden in CLI
+   checkpoint: best.ckpt
+
+ dataset:
+   kind: default
+   img_suffix: .png
+   pad_out_to_modulo: 8
+
+ device: cuda
+ out_key: inpainted
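
This prediction config follows the LaMa-style layout, which is typically consumed through OmegaConf; how this repository actually loads it is not shown in the diff, so the snippet below is only an illustration. A bare `no` resolves to the boolean false under YAML 1.1 resolvers such as PyYAML's, which is why those fields must be overridden on the command line.

from omegaconf import OmegaConf

cfg = OmegaConf.load('sod/configs/prediction/default.yaml')
# e.g. python predict.py indir=./input outdir=./output model.path=./weights
cfg = OmegaConf.merge(cfg, OmegaConf.from_cli())
print(cfg.model.checkpoint, cfg.dataset.pad_out_to_modulo, cfg.device)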
sod/infer_model.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import cv2
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+
+ import sys
+ sys.path.insert(0, '../')
+ sys.dont_write_bytecode = True
+ from .PGNet import PGNet
+
+ class Normalize(object):
+     def __init__(self, mean, std):
+         self.mean = mean
+         self.std = std
+
+     def __call__(self, image):
+         image = (image - self.mean) / self.std
+         return image
+
+ class Config(object):
+     def __init__(self, **kwargs):
+         self.kwargs = kwargs
+         self.mean = np.array([[[124.55, 118.90, 102.94]]])
+         self.std = np.array([[[56.77, 55.97, 57.50]]])
+         print('\nParameters...')
+         for k, v in self.kwargs.items():
+             print('%-10s: %s' % (k, v))
+
+     def __getattr__(self, name):
+         if name in self.kwargs:
+             return self.kwargs[name]
+         else:
+             return None
+
+ class IVModel():
+     def __init__(self, device=torch.device('cuda:0')):
+         super(IVModel, self).__init__()
+         self.device = device
+         checkpoint_path = 'sod/weights/PGNet_DUT+HR-model-31.pth'
+         self.cfg = Config(snapshot=checkpoint_path, mode='test')
+         if not os.path.exists(checkpoint_path):
+             print('Model weights file not found!')
+         self.net = PGNet(self.cfg)
+         self.net.train(False)
+         self.net.to(device)
+         self.normalize = Normalize(mean=self.cfg.mean, std=self.cfg.std)
+
+         self.__first_forward__()
+
+
+     def __first_forward__(self, input_size=(2048, 2048, 3)):
+         # run forward() once to strictly bound the peak GPU memory usage
+         print('initialize Sod Model...')
+         _ = self.forward(np.random.rand(*input_size) * 255, None)
+         print('initialize Complete!')
+
+     def __resize_tensor__(self, image, max_size=1024):
+         h, w = image.size()[2:]
+         if max(h, w) > max_size:
+             if h < w:
+                 h, w = int(max_size * h / w) // 8 * 8, max_size
+             else:
+                 h, w = max_size, int(max_size * w / h) // 8 * 8
+             image = F.interpolate(image, (h, w), mode='area')
+         return image
+
+     def input_preprocess_tensor(self, img):
+         img = self.normalize(img)
+         img_t = torch.from_numpy(img.astype(np.float32))  # .to(self.device)
+         img_t = img_t.permute(2, 0, 1).unsqueeze(0)
+         img_t = self.__resize_tensor__(img_t).to(self.device)  # resize before moving to the device to cap GPU memory
+         return img_t
+
+     def forward(self, img, json_data):
+         img_t = self.input_preprocess_tensor(img)
+         shape = [torch.as_tensor([img_t.shape[2]]), torch.as_tensor([img_t.shape[3]])]
+         h, w = img_t.shape[2], img_t.shape[3]
+         img_t_temp = F.interpolate(img_t, (1024, 1024), mode='area')
+         with torch.no_grad():
+             res = self.net(img_t_temp, shape=shape)
+         res = F.interpolate(res[0], size=shape, mode='bilinear')
+         res = torch.sigmoid(res)
+         # print(res.shape, img_t.shape, res.expand_as(img_t).shape)
+         res = torch.cat([img_t, res.expand_as(img_t)], dim=3)
+         res = (res[0].permute(1, 2, 0)).cpu().numpy()
+         res[:, :w, :] = res[:, :w, :] * self.cfg.std + self.cfg.mean
+         res[:, w:, :] = res[:, w:, :] * 255
+         return res
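
A minimal sketch of driving the wrapper above end to end (illustrative, not part of this commit). It assumes sod is importable as a package from the repository root and that the PGNet weights referenced in __init__ ('sod/weights/PGNet_DUT+HR-model-31.pth') are present; constructing IVModel already runs a 2048x2048 warm-up forward pass. The normalization constants suggest RGB input, so the channel flip applied to the BGR array from cv2.imread is an assumption, and 'input.jpg' is a placeholder path:

import cv2
import numpy as np
import torch
from sod.infer_model import IVModel

model = IVModel(device=torch.device('cpu'))
img = cv2.imread('input.jpg')                  # any test image, H x W x 3, BGR
out = model.forward(img[:, :, ::-1], None)     # float array: [resized input | saliency mask * 255] side by side
out = np.uint8(np.clip(out, 0, 255))
cv2.imwrite('sod_result.png', out[:, :, ::-1])  # flip back to BGR for OpenCV

The saliency prediction occupies the right half of the returned array (columns w: onward), which downstream code can threshold into a mask for the inpainting stage.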