# configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
_base_ = [
'../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
'../_base_/schedules/schedule_160k.py'
]
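# Inherit the ADE20K dataset pipeline, default runtime hooks, and the
# 160k-iteration training schedule from the shared _base_ configs above.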
norm_cfg = dict(type='SyncBN', requires_grad=True)
crop_size = (512, 512)
data_preprocessor = dict(
type='SegDataPreProcessor',
size=crop_size,
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_val=0,
seg_pad_val=255)
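# The preprocessor normalizes with ImageNet mean/std, converts BGR to RGB,
# pads images to the 512x512 crop with 0, and pads segmentation maps with
# 255, the ignore index for ADE20K.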
# model_cfg
num_classes = 150
model = dict(
type='EncoderDecoder',
data_preprocessor=data_preprocessor,
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
dilations=(1, 1, 1, 1),
strides=(1, 2, 2, 2),
norm_cfg=norm_cfg,
norm_eval=True,
style='pytorch',
contract_dilation=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
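    # "-d32" in the config name: no dilation, so the four stage outputs sit at
    # strides 4/8/16/32 relative to the input; all of them feed the head.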
decode_head=dict(
type='MaskFormerHead',
in_channels=[256, 512, 1024,
2048], # input channels of pixel_decoder modules
feat_channels=256,
in_index=[0, 1, 2, 3],
        num_classes=num_classes,  # reuse the variable defined above
out_channels=256,
num_queries=100,
pixel_decoder=dict(
type='mmdet.PixelDecoder',
norm_cfg=dict(type='GN', num_groups=32),
act_cfg=dict(type='ReLU')),
enforce_decoder_input_project=False,
positional_encoding=dict( # SinePositionalEncoding
num_feats=128, normalize=True),
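        # num_feats=128 per spatial axis yields a 256-dim sine embedding,
        # matching feat_channels/embed_dims of the transformer decoder.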
transformer_decoder=dict( # DetrTransformerDecoder
return_intermediate=True,
num_layers=6,
layer_cfg=dict( # DetrTransformerDecoderLayer
self_attn_cfg=dict( # MultiheadAttention
embed_dims=256,
num_heads=8,
attn_drop=0.1,
proj_drop=0.1,
dropout_layer=None,
batch_first=True),
cross_attn_cfg=dict( # MultiheadAttention
embed_dims=256,
num_heads=8,
attn_drop=0.1,
proj_drop=0.1,
dropout_layer=None,
batch_first=True),
ffn_cfg=dict(
embed_dims=256,
feedforward_channels=2048,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.1,
dropout_layer=None,
add_identity=True)),
init_cfg=None),
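        # Six DETR-style decoder layers refine the 100 learned queries (one
        # candidate mask each); return_intermediate=True exposes every
        # layer's output so it can be used for per-layer (auxiliary) losses.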
loss_cls=dict(
type='mmdet.CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0,
reduction='mean',
            class_weight=[1.0] * num_classes + [0.1]),  # 0.1 down-weights the no-object class
loss_mask=dict(
type='mmdet.FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
reduction='mean',
loss_weight=20.0),
loss_dice=dict(
type='mmdet.DiceLoss',
use_sigmoid=True,
activate=True,
reduction='mean',
naive_dice=True,
eps=1.0,
loss_weight=1.0),
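        # The three loss weights (cls 1.0, mask 20.0, dice 1.0) mirror the
        # Hungarian matching costs configured in train_cfg below.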
train_cfg=dict(
assigner=dict(
type='mmdet.HungarianAssigner',
match_costs=[
dict(type='mmdet.ClassificationCost', weight=1.0),
dict(
type='mmdet.FocalLossCost',
weight=20.0,
binary_input=True),
dict(
type='mmdet.DiceCost',
weight=1.0,
pred_act=True,
eps=1.0)
]),
sampler=dict(type='mmdet.MaskPseudoSampler'))),
    # model-level training and testing settings (mask-to-query assignment is
    # configured in decode_head.train_cfg above)
train_cfg=dict(),
test_cfg=dict(mode='whole'),
)
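# A minimal sketch of consuming this config programmatically (assumes an
# mmsegmentation 1.x checkout with mmdet installed, since the head reuses
# mmdet components); kept commented out so the config stays side-effect free:
#
#   from mmengine.config import Config
#   from mmseg.registry import MODELS
#   from mmseg.utils import register_all_modules
#
#   register_all_modules()  # register mmseg (and wrapped mmdet) modules
#   cfg = Config.fromfile(
#       'configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py')
#   model = MODELS.build(cfg.model)  # EncoderDecoder with MaskFormerHead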
# optimizer
optimizer = dict(
type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
optim_wrapper = dict(
_delete_=True,
type='OptimWrapper',
optimizer=optimizer,
clip_grad=dict(max_norm=0.01, norm_type=2),
paramwise_cfg=dict(custom_keys={
'backbone': dict(lr_mult=0.1),
}))
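# AdamW at 1e-4 with the backbone down-scaled to 1e-5 via lr_mult=0.1;
# gradients are clipped to a very tight L2 norm of 0.01.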
# learning policy
param_scheduler = [
dict(
type='PolyLR',
eta_min=0,
power=0.9,
begin=0,
end=160000,
by_epoch=False)
]
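# Polynomial decay (power 0.9) from the base lr to 0 across all 160k
# iterations, stepped per iteration (by_epoch=False) rather than per epoch.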
# The MaskFormer recipe defaults to a batch size of 2 per GPU
# (8 GPUs x 2 = global batch 16, matching the "8xb2" in the config name).
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader
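# Sketch of a typical launch with the standard mmsegmentation tools/ layout
# (paths are illustrative):
#
#   # single GPU
#   python tools/train.py \
#       configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
#
#   # 8 GPUs, matching the schedule this config was tuned for
#   bash tools/dist_train.sh \
#       configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py 8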