# Spaces: Runtime error
# File size: 4,266 Bytes
# 412c852
# Model settings.
#
# This section defines exactly four module-level names: norm_cfg,
# data_preprocessor, num_classes and model.

# Not referenced by any other setting in this section.
norm_cfg = {'type': 'SyncBN', 'requires_grad': True}

# Input pre-processing settings; the same object is handed to the model
# below under the 'data_preprocessor' key.
data_preprocessor = {
    'type': 'SegDataPreProcessor',
    'mean': [122.7709, 116.7460, 104.0937],
    'std': [68.5005, 66.6322, 70.3232],
    'bgr_to_rgb': True,
    'pad_val': 0,
    'seg_pad_val': 255,
    'size_divisor': 640,
    # test_cfg carries its own, smaller size_divisor.
    'test_cfg': {'size_divisor': 32},
}

# Used for decode_head['num_classes'] and for sizing the class_weight list
# of the first loss in decode_head['loss_decode'].
num_classes = 171

model = {
    'type': 'MultimodalEncoderDecoder',
    'data_preprocessor': data_preprocessor,
    'pretrained': 'pretrain/clip_vit_base_patch16_224.pth',
    # NOTE(review): the key is spelled 'asymetric_input' on purpose; it is
    # read at runtime, so the spelling must not be "corrected" here.
    'asymetric_input': True,
    'encoder_resolution': 0.5,
    'image_encoder': {
        'type': 'VisionTransformer',
        'img_size': (224, 224),
        'patch_size': 16,
        'patch_pad': 0,
        'in_channels': 3,
        'embed_dims': 768,
        'num_layers': 9,
        'num_heads': 12,
        'mlp_ratio': 4,
        'out_origin': True,
        'out_indices': (2, 5, 8),
        'qkv_bias': True,
        'drop_rate': 0.0,
        'attn_drop_rate': 0.0,
        'drop_path_rate': 0.0,
        'with_cls_token': True,
        'output_cls_token': True,
        'patch_bias': False,
        'pre_norm': True,
        'norm_cfg': {'type': 'LN', 'eps': 1e-5},
        'act_cfg': {'type': 'QuickGELU'},
        'norm_eval': False,
        'interpolate_mode': 'bicubic',
        'frozen_exclude': ['pos_embed'],
    },
    'text_encoder': {
        'type': 'CLIPTextEncoder',
        'dataset_name': None,
        'templates': 'vild',
        'embed_dims': 512,
        'num_layers': 12,
        'num_heads': 8,
        'mlp_ratio': 4,
        'output_dims': 512,
        'cache_feature': True,
        'cat_bg': True,
        'norm_cfg': {'type': 'LN', 'eps': 1e-5},
    },
    'decode_head': {
        'type': 'SideAdapterCLIPHead',
        'num_classes': num_classes,
        'deep_supervision_idxs': [7],
        'san_cfg': {
            'in_channels': 3,
            'clip_channels': 768,
            'embed_dims': 240,
            'patch_size': 16,
            'patch_bias': True,
            'num_queries': 100,
            'cfg_encoder': {
                'num_encode_layer': 8,
                'num_heads': 6,
                'mlp_ratio': 4,
            },
            'fusion_index': [0, 1, 2, 3],
            'cfg_decoder': {
                'num_heads': 12,
                'num_layers': 1,
                'embed_channels': 256,
                'mlp_channels': 256,
                'num_mlp': 3,
                'rescale': True,
            },
            # This LN uses eps=1e-6, unlike the 1e-5 used by the other
            # norm_cfg entries in this file.
            'norm_cfg': {'type': 'LN', 'eps': 1e-6},
        },
        'maskgen_cfg': {
            'sos_token_format': 'cls_token',
            'sos_token_num': 100,
            'cross_attn': False,
            'num_layers': 3,
            'embed_dims': 768,
            'num_heads': 12,
            'mlp_ratio': 4,
            'qkv_bias': True,
            'out_dims': 512,
            'final_norm': True,
            'act_cfg': {'type': 'QuickGELU'},
            'norm_cfg': {'type': 'LN', 'eps': 1e-5},
            'frozen_exclude': [],
        },
        'align_corners': False,
        'train_cfg': {
            'num_points': 12544,
            'oversample_ratio': 3.0,
            'importance_sample_ratio': 0.75,
            'assigner': {
                'type': 'HungarianAssigner',
                # The three cost weights (2.0 / 5.0 / 5.0) equal the
                # loss_weight values of the three losses in loss_decode.
                'match_costs': [
                    {'type': 'ClassificationCost', 'weight': 2.0},
                    {
                        'type': 'CrossEntropyLossCost',
                        'weight': 5.0,
                        'use_sigmoid': True,
                    },
                    {
                        'type': 'DiceCost',
                        'weight': 5.0,
                        'pred_act': True,
                        'eps': 1.0,
                    },
                ],
            },
        },
        'loss_decode': [
            {
                'type': 'CrossEntropyLoss',
                'loss_name': 'loss_cls_ce',
                'loss_weight': 2.0,
                # num_classes entries of 1.0 followed by one extra entry of
                # 0.1 (presumably the "no object" slot; confirm against the
                # decode head implementation).
                'class_weight': [1.0] * num_classes + [0.1],
            },
            {
                'type': 'CrossEntropyLoss',
                'use_sigmoid': True,
                'loss_name': 'loss_mask_ce',
                'loss_weight': 5.0,
            },
            {
                'type': 'DiceLoss',
                'ignore_index': None,
                'naive_dice': True,
                'eps': 1,
                'loss_name': 'loss_mask_dice',
                'loss_weight': 5.0,
            },
        ],
    },
    # Model training and testing settings.
    'train_cfg': {},
    'test_cfg': {'mode': 'whole'},
}  # yapf: disable
|