Spaces:
Runtime error
Runtime error
# model settings | |
norm_cfg = dict(type='SyncBN', requires_grad=True) | |
data_preprocessor = dict( | |
type='SegDataPreProcessor', | |
mean=[122.7709, 116.7460, 104.0937], | |
std=[68.5005, 66.6322, 70.3232], | |
bgr_to_rgb=True, | |
pad_val=0, | |
seg_pad_val=255, | |
size_divisor=640, | |
test_cfg=dict(size_divisor=32)) | |
num_classes = 171 | |
model = dict( | |
type='MultimodalEncoderDecoder', | |
data_preprocessor=data_preprocessor, | |
pretrained='pretrain/clip_vit_base_patch16_224.pth', | |
asymetric_input=True, | |
encoder_resolution=0.5, | |
image_encoder=dict( | |
type='VisionTransformer', | |
img_size=(224, 224), | |
patch_size=16, | |
patch_pad=0, | |
in_channels=3, | |
embed_dims=768, | |
num_layers=9, | |
num_heads=12, | |
mlp_ratio=4, | |
out_origin=True, | |
out_indices=(2, 5, 8), | |
qkv_bias=True, | |
drop_rate=0.0, | |
attn_drop_rate=0.0, | |
drop_path_rate=0.0, | |
with_cls_token=True, | |
output_cls_token=True, | |
patch_bias=False, | |
pre_norm=True, | |
norm_cfg=dict(type='LN', eps=1e-5), | |
act_cfg=dict(type='QuickGELU'), | |
norm_eval=False, | |
interpolate_mode='bicubic', | |
frozen_exclude=['pos_embed']), | |
text_encoder=dict( | |
type='CLIPTextEncoder', | |
dataset_name=None, | |
templates='vild', | |
embed_dims=512, | |
num_layers=12, | |
num_heads=8, | |
mlp_ratio=4, | |
output_dims=512, | |
cache_feature=True, | |
cat_bg=True, | |
norm_cfg=dict(type='LN', eps=1e-5) | |
), | |
decode_head=dict( | |
type='SideAdapterCLIPHead', | |
num_classes=num_classes, | |
deep_supervision_idxs=[7], | |
san_cfg=dict( | |
in_channels=3, | |
clip_channels=768, | |
embed_dims=240, | |
patch_size=16, | |
patch_bias=True, | |
num_queries=100, | |
cfg_encoder=dict( | |
num_encode_layer=8, | |
num_heads=6, | |
mlp_ratio=4 | |
), | |
fusion_index=[0, 1, 2, 3], | |
cfg_decoder=dict( | |
num_heads=12, | |
num_layers=1, | |
embed_channels=256, | |
mlp_channels=256, | |
num_mlp=3, | |
rescale=True), | |
norm_cfg=dict(type='LN', eps=1e-6), | |
), | |
maskgen_cfg=dict( | |
sos_token_format='cls_token', | |
sos_token_num=100, | |
cross_attn=False, | |
num_layers=3, | |
embed_dims=768, | |
num_heads=12, | |
mlp_ratio=4, | |
qkv_bias=True, | |
out_dims=512, | |
final_norm=True, | |
act_cfg=dict(type='QuickGELU'), | |
norm_cfg=dict(type='LN', eps=1e-5), | |
frozen_exclude=[] | |
), | |
align_corners=False, | |
train_cfg=dict( | |
num_points=12544, | |
oversample_ratio=3.0, | |
importance_sample_ratio=0.75, | |
assigner=dict( | |
type='HungarianAssigner', | |
match_costs=[ | |
dict(type='ClassificationCost', weight=2.0), | |
dict( | |
type='CrossEntropyLossCost', | |
weight=5.0, | |
use_sigmoid=True), | |
dict( | |
type='DiceCost', | |
weight=5.0, | |
pred_act=True, | |
eps=1.0) | |
])), | |
loss_decode=[dict(type='CrossEntropyLoss', | |
loss_name='loss_cls_ce', | |
loss_weight=2.0, | |
class_weight=[1.0] * num_classes + [0.1]), | |
dict(type='CrossEntropyLoss', | |
use_sigmoid=True, | |
loss_name='loss_mask_ce', | |
loss_weight=5.0), | |
dict(type='DiceLoss', | |
ignore_index=None, | |
naive_dice=True, | |
eps=1, | |
loss_name='loss_mask_dice', | |
loss_weight=5.0) | |
]), | |
# model training and testing settings | |
train_cfg=dict(), | |
test_cfg=dict(mode='whole')) # yapf: disable | |