_base_ = [
    '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
    '../_base_/schedules/schedule_160k.py'
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (512, 512)
data_preprocessor = dict(
    type='SegDataPreProcessor',
    size=crop_size,
    mean=[123.675, 116.28, 103.53],
    std=[58.395, 57.12, 57.375],
    bgr_to_rgb=True,
    pad_val=0,
    seg_pad_val=255)
# model_cfg
num_classes = 150
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_cfg=norm_cfg,
        norm_eval=True,
        style='pytorch',
        contract_dilation=True,
        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
    decode_head=dict(
        type='MaskFormerHead',
        in_channels=[256, 512, 1024,
                     2048],  # input channels of pixel_decoder modules
        feat_channels=256,
        in_index=[0, 1, 2, 3],
        num_classes=150,
        out_channels=256,
        num_queries=100,
        pixel_decoder=dict(
            type='mmdet.PixelDecoder',
            norm_cfg=dict(type='GN', num_groups=32),
            act_cfg=dict(type='ReLU')),
        enforce_decoder_input_project=False,
        positional_encoding=dict(  # SinePositionalEncoding
            num_feats=128, normalize=True),
        transformer_decoder=dict(  # DetrTransformerDecoder
            return_intermediate=True,
            num_layers=6,
            layer_cfg=dict(  # DetrTransformerDecoderLayer
                self_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=None,
                    batch_first=True),
                cross_attn_cfg=dict(  # MultiheadAttention
                    embed_dims=256,
                    num_heads=8,
                    attn_drop=0.1,
                    proj_drop=0.1,
                    dropout_layer=None,
                    batch_first=True),
                ffn_cfg=dict(
                    embed_dims=256,
                    feedforward_channels=2048,
                    num_fcs=2,
                    act_cfg=dict(type='ReLU', inplace=True),
                    ffn_drop=0.1,
                    dropout_layer=None,
                    add_identity=True)),
            init_cfg=None),
        loss_cls=dict(
            type='mmdet.CrossEntropyLoss',
            use_sigmoid=False,
            loss_weight=1.0,
            reduction='mean',
            class_weight=[1.0] * num_classes + [0.1]),
        loss_mask=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            reduction='mean',
            loss_weight=20.0),
        loss_dice=dict(
            type='mmdet.DiceLoss',
            use_sigmoid=True,
            activate=True,
            reduction='mean',
            naive_dice=True,
            eps=1.0,
            loss_weight=1.0),
        train_cfg=dict(
            assigner=dict(
                type='mmdet.HungarianAssigner',
                match_costs=[
                    dict(type='mmdet.ClassificationCost', weight=1.0),
                    dict(
                        type='mmdet.FocalLossCost',
                        weight=20.0,
                        binary_input=True),
                    dict(
                        type='mmdet.DiceCost',
                        weight=1.0,
                        pred_act=True,
                        eps=1.0)
                ]),
            sampler=dict(type='mmdet.MaskPseudoSampler'))),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(mode='whole'),
)
# optimizer
optimizer = dict(
    type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
optim_wrapper = dict(
    _delete_=True,
    type='OptimWrapper',
    optimizer=optimizer,
    clip_grad=dict(max_norm=0.01, norm_type=2),
    paramwise_cfg=dict(custom_keys={
        'backbone': dict(lr_mult=0.1),
    }))
# learning policy
param_scheduler = [
    dict(
        type='PolyLR',
        eta_min=0,
        power=0.9,
        begin=0,
        end=160000,
        by_epoch=False)
]
# The MaskFormer implementation uses a batch size of 2 per GPU by default
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader
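
# Usage sketch (assumptions: MMSegmentation is installed and this file lives
# under configs/maskformer/ with a name such as
# maskformer_r50-d32_8xb2-160k_ade20k-512x512.py; the exact path is
# hypothetical here). Training is typically launched with the standard
# MMSegmentation entry points:
#   single GPU: python tools/train.py configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
#   8 GPUs:     bash tools/dist_train.sh configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py 8
# With 8 GPUs and batch_size=2 per GPU, the effective batch size is 16, which
# matches the 160k-iteration schedule inherited from schedule_160k.py.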