Spaces:

kernel-luso-comfort
/

BiomedParse

Runtime error

App Files Files Community

BiomedParse / modeling /vision /backbone /davit.py

kernel-luso-comfort

Add initial module structure and entry points for modeling and utilities

6ba63c9 11 months ago

raw

history blame

20.9 kB

	import os
	import itertools
	import logging

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.utils.checkpoint as checkpoint
	from collections import OrderedDict

	from einops import rearrange
	from timm.models.layers import DropPath, trunc_normal_

	from detectron2.utils.file_io import PathManager
	from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec

	from .build import register_backbone

	logger = logging.getLogger(__name__)



	class MySequential(nn.Sequential):
	def forward(self, *inputs):
	for module in self._modules.values():
	if type(inputs) == tuple:
	inputs = module(*inputs)
	else:
	inputs = module(inputs)
	return inputs


	class PreNorm(nn.Module):
	def __init__(self, norm, fn, drop_path=None):
	super().__init__()
	self.norm = norm
	self.fn = fn
	self.drop_path = drop_path

	def forward(self, x, args, *kwargs):
	shortcut = x
	if self.norm != None:
	x, size = self.fn(self.norm(x), args, *kwargs)
	else:
	x, size = self.fn(x, args, *kwargs)

	if self.drop_path:
	x = self.drop_path(x)

	x = shortcut + x

	return x, size


	class Mlp(nn.Module):
	def __init__(
	self,
	in_features,
	hidden_features=None,
	out_features=None,
	act_layer=nn.GELU,
	):
	super().__init__()
	out_features = out_features or in_features
	hidden_features = hidden_features or in_features
	self.net = nn.Sequential(OrderedDict([
	("fc1", nn.Linear(in_features, hidden_features)),
	("act", act_layer()),
	("fc2", nn.Linear(hidden_features, out_features))
	]))

	def forward(self, x, size):
	return self.net(x), size


	class DepthWiseConv2d(nn.Module):
	def __init__(
	self,
	dim_in,
	kernel_size,
	padding,
	stride,
	bias=True,
	):
	super().__init__()
	self.dw = nn.Conv2d(
	dim_in, dim_in,
	kernel_size=kernel_size,
	padding=padding,
	groups=dim_in,
	stride=stride,
	bias=bias
	)

	def forward(self, x, size):
	B, N, C = x.shape
	H, W = size
	assert N == H * W

	x = self.dw(x.transpose(1, 2).view(B, C, H, W))
	size = (x.size(-2), x.size(-1))
	x = x.flatten(2).transpose(1, 2)
	return x, size


	class ConvEmbed(nn.Module):
	""" Image to Patch Embedding
	"""

	def __init__(
	self,
	patch_size=7,
	in_chans=3,
	embed_dim=64,
	stride=4,
	padding=2,
	norm_layer=None,
	pre_norm=True
	):
	super().__init__()
	self.patch_size = patch_size

	self.proj = nn.Conv2d(
	in_chans, embed_dim,
	kernel_size=patch_size,
	stride=stride,
	padding=padding
	)

	dim_norm = in_chans if pre_norm else embed_dim
	self.norm = norm_layer(dim_norm) if norm_layer else None

	self.pre_norm = pre_norm

	def forward(self, x, size):
	H, W = size
	if len(x.size()) == 3:
	if self.norm and self.pre_norm:
	x = self.norm(x)
	x = rearrange(
	x, 'b (h w) c -> b c h w',
	h=H, w=W
	)

	x = self.proj(x)

	_, _, H, W = x.shape
	x = rearrange(x, 'b c h w -> b (h w) c')
	if self.norm and not self.pre_norm:
	x = self.norm(x)

	return x, (H, W)


	class ChannelAttention(nn.Module):

	def __init__(self, dim, groups=8, qkv_bias=True):
	super().__init__()

	self.groups = groups
	self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
	self.proj = nn.Linear(dim, dim)

	def forward(self, x, size):
	B, N, C = x.shape

	qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
	q, k, v = qkv[0], qkv[1], qkv[2]

	q = q * (N ** -0.5)
	attention = q.transpose(-1, -2) @ k
	attention = attention.softmax(dim=-1)
	x = (attention @ v.transpose(-1, -2)).transpose(-1, -2)
	x = x.transpose(1, 2).reshape(B, N, C)
	x = self.proj(x)
	return x, size


	class ChannelBlock(nn.Module):

	def __init__(self, dim, groups, mlp_ratio=4., qkv_bias=True,
	drop_path_rate=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm,
	conv_at_attn=True, conv_at_ffn=True):
	super().__init__()

	drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

	self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
	self.channel_attn = PreNorm(
	norm_layer(dim),
	ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
	drop_path
	)
	self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
	self.ffn = PreNorm(
	norm_layer(dim),
	Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
	drop_path
	)

	def forward(self, x, size):
	if self.conv1:
	x, size = self.conv1(x, size)
	x, size = self.channel_attn(x, size)

	if self.conv2:
	x, size = self.conv2(x, size)
	x, size = self.ffn(x, size)

	return x, size


	def window_partition(x, window_size: int):
	B, H, W, C = x.shape
	x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
	windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
	return windows


	def window_reverse(windows, window_size: int, H: int, W: int):
	B = int(windows.shape[0] / (H * W / window_size / window_size))
	x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
	x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
	return x


	class WindowAttention(nn.Module):
	def __init__(self, dim, num_heads, window_size, qkv_bias=True):

	super().__init__()
	self.dim = dim
	self.window_size = window_size
	self.num_heads = num_heads
	head_dim = dim // num_heads
	self.scale = head_dim ** -0.5

	self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
	self.proj = nn.Linear(dim, dim)

	self.softmax = nn.Softmax(dim=-1)

	def forward(self, x, size):

	H, W = size
	B, L, C = x.shape
	assert L == H * W, "input feature has wrong size"

	x = x.view(B, H, W, C)

	pad_l = pad_t = 0
	pad_r = (self.window_size - W % self.window_size) % self.window_size
	pad_b = (self.window_size - H % self.window_size) % self.window_size
	x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
	_, Hp, Wp, _ = x.shape

	x = window_partition(x, self.window_size)
	x = x.view(-1, self.window_size * self.window_size, C)

	# W-MSA/SW-MSA
	# attn_windows = self.attn(x_windows)

	B_, N, C = x.shape
	qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
	q, k, v = qkv[0], qkv[1], qkv[2]

	q = q * self.scale
	attn = (q @ k.transpose(-2, -1))
	attn = self.softmax(attn)

	x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
	x = self.proj(x)

	# merge windows
	x = x.view(
	-1, self.window_size, self.window_size, C
	)
	x = window_reverse(x, self.window_size, Hp, Wp)

	if pad_r > 0 or pad_b > 0:
	x = x[:, :H, :W, :].contiguous()

	x = x.view(B, H * W, C)

	return x, size


	class SpatialBlock(nn.Module):

	def __init__(self, dim, num_heads, window_size,
	mlp_ratio=4., qkv_bias=True, drop_path_rate=0., act_layer=nn.GELU,
	norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
	super().__init__()

	drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

	self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
	self.window_attn = PreNorm(
	norm_layer(dim),
	WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
	drop_path
	)
	self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
	self.ffn = PreNorm(
	norm_layer(dim),
	Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
	drop_path
	)

	def forward(self, x, size):
	if self.conv1:
	x, size = self.conv1(x, size)
	x, size = self.window_attn(x, size)

	if self.conv2:
	x, size = self.conv2(x, size)
	x, size = self.ffn(x, size)
	return x, size


	class DaViT(nn.Module):
	""" DaViT: Dual-Attention Transformer

	Args:
	img_size (int): Image size, Default: 224.
	in_chans (int): Number of input image channels. Default: 3.
	num_classes (int): Number of classes for classification head. Default: 1000.
	patch_size (tuple(int)): Patch size of convolution in different stages. Default: (7, 2, 2, 2).
	patch_stride (tuple(int)): Patch stride of convolution in different stages. Default: (4, 2, 2, 2).
	patch_padding (tuple(int)): Patch padding of convolution in different stages. Default: (3, 0, 0, 0).
	patch_prenorm (tuple(bool)): If True, perform norm before convlution layer. Default: (True, False, False, False).
	embed_dims (tuple(int)): Patch embedding dimension in different stages. Default: (64, 128, 192, 256).
	num_heads (tuple(int)): Number of spatial attention heads in different stages. Default: (4, 8, 12, 16).
	num_groups (tuple(int)): Number of channel groups in different stages. Default: (4, 8, 12, 16).
	window_size (int): Window size. Default: 7.
	mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
	qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True.
	drop_path_rate (float): Stochastic depth rate. Default: 0.1.
	norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
	enable_checkpoint (bool): If True, enable checkpointing. Default: False.
	conv_at_attn (bool): If True, performe depthwise convolution before attention layer. Default: True.
	conv_at_ffn (bool): If True, performe depthwise convolution before ffn layer. Default: True.
	"""

	def __init__(
	self,
	img_size=224,
	in_chans=3,
	num_classes=1000,
	depths=(1, 1, 3, 1),
	patch_size=(7, 2, 2, 2),
	patch_stride=(4, 2, 2, 2),
	patch_padding=(3, 0, 0, 0),
	patch_prenorm=(False, False, False, False),
	embed_dims=(64, 128, 192, 256),
	num_heads=(3, 6, 12, 24),
	num_groups=(3, 6, 12, 24),
	window_size=7,
	mlp_ratio=4.,
	qkv_bias=True,
	drop_path_rate=0.1,
	norm_layer=nn.LayerNorm,
	enable_checkpoint=False,
	conv_at_attn=True,
	conv_at_ffn=True,
	out_indices=[],
	):
	super().__init__()

	self.num_classes = num_classes
	self.embed_dims = embed_dims
	self.num_heads = num_heads
	self.num_groups = num_groups
	self.num_stages = len(self.embed_dims)
	self.enable_checkpoint = enable_checkpoint
	assert self.num_stages == len(self.num_heads) == len(self.num_groups)

	num_stages = len(embed_dims)
	self.img_size = img_size
	dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]


	depth_offset = 0
	convs = []
	blocks = []
	for i in range(num_stages):
	conv_embed = ConvEmbed(
	patch_size=patch_size[i],
	stride=patch_stride[i],
	padding=patch_padding[i],
	in_chans=in_chans if i == 0 else self.embed_dims[i - 1],
	embed_dim=self.embed_dims[i],
	norm_layer=norm_layer,
	pre_norm=patch_prenorm[i]
	)
	convs.append(conv_embed)

	print(f'=> Depth offset in stage {i}: {depth_offset}')
	block = MySequential(
	*[
	MySequential(OrderedDict([
	(
	'spatial_block', SpatialBlock(
	embed_dims[i],
	num_heads[i],
	window_size,
	drop_path_rate=dpr[depth_offset+j*2],
	qkv_bias=qkv_bias,
	mlp_ratio=mlp_ratio,
	conv_at_attn=conv_at_attn,
	conv_at_ffn=conv_at_ffn,
	)
	),
	(
	'channel_block', ChannelBlock(
	embed_dims[i],
	num_groups[i],
	drop_path_rate=dpr[depth_offset+j*2+1],
	qkv_bias=qkv_bias,
	mlp_ratio=mlp_ratio,
	conv_at_attn=conv_at_attn,
	conv_at_ffn=conv_at_ffn,
	)
	)
	])) for j in range(depths[i])
	]
	)
	blocks.append(block)
	depth_offset += depths[i]*2

	self.convs = nn.ModuleList(convs)
	self.blocks = nn.ModuleList(blocks)

	self.out_indices = out_indices
	# self.norms = norm_layer(self.embed_dims[-1])
	# self.avgpool = nn.AdaptiveAvgPool1d(1)
	# self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
	self.apply(self._init_weights)

	@property
	def dim_out(self):
	return self.embed_dims[-1]

	def _init_weights(self, m):
	if isinstance(m, nn.Linear):
	trunc_normal_(m.weight, std=0.02)
	if m.bias is not None:
	nn.init.constant_(m.bias, 0)
	elif isinstance(m, nn.Conv2d):
	nn.init.normal_(m.weight, std=0.02)
	for name, _ in m.named_parameters():
	if name in ['bias']:
	nn.init.constant_(m.bias, 0)
	elif isinstance(m, nn.LayerNorm):
	nn.init.constant_(m.weight, 1.0)
	nn.init.constant_(m.bias, 0)
	elif isinstance(m, nn.BatchNorm2d):
	nn.init.constant_(m.weight, 1.0)
	nn.init.constant_(m.bias, 0)

	def _try_remap_keys(self, pretrained_dict):
	remap_keys = {
	"conv_embeds": "convs",
	"main_blocks": "blocks",
	"0.cpe.0.proj": "spatial_block.conv1.fn.dw",
	"0.attn": "spatial_block.window_attn.fn",
	"0.cpe.1.proj": "spatial_block.conv2.fn.dw",
	"0.mlp": "spatial_block.ffn.fn.net",
	"1.cpe.0.proj": "channel_block.conv1.fn.dw",
	"1.attn": "channel_block.channel_attn.fn",
	"1.cpe.1.proj": "channel_block.conv2.fn.dw",
	"1.mlp": "channel_block.ffn.fn.net",
	"0.norm1": "spatial_block.window_attn.norm",
	"0.norm2": "spatial_block.ffn.norm",
	"1.norm1": "channel_block.channel_attn.norm",
	"1.norm2": "channel_block.ffn.norm"
	}

	full_key_mappings = {}
	for k in pretrained_dict.keys():
	old_k = k
	for remap_key in remap_keys.keys():
	if remap_key in k:
	print(f'=> Repace {remap_key} with {remap_keys[remap_key]}')
	k = k.replace(remap_key, remap_keys[remap_key])

	full_key_mappings[old_k] = k

	return full_key_mappings

	def from_state_dict(self, pretrained_dict, pretrained_layers=[], verbose=True):
	model_dict = self.state_dict()
	stripped_key = lambda x: x[14:] if x.startswith('image_encoder.') else x
	full_key_mappings = self._try_remap_keys(pretrained_dict)

	pretrained_dict = {
	stripped_key(full_key_mappings[k]): v for k, v in pretrained_dict.items()
	if stripped_key(full_key_mappings[k]) in model_dict.keys()
	}
	need_init_state_dict = {}
	for k, v in pretrained_dict.items():
	need_init = (
	k.split('.')[0] in pretrained_layers
	or pretrained_layers[0] == '*'
	)
	if need_init:
	if verbose:
	print(f'=> init {k} from pretrained state dict')

	need_init_state_dict[k] = v
	self.load_state_dict(need_init_state_dict, strict=False)

	def from_pretrained(self, pretrained='', pretrained_layers=[], verbose=True):
	if os.path.isfile(pretrained):
	print(f'=> loading pretrained model {pretrained}')
	pretrained_dict = torch.load(pretrained, map_location='cpu')

	self.from_state_dict(pretrained_dict, pretrained_layers, verbose)

	def forward_features(self, x):
	input_size = (x.size(2), x.size(3))

	outs = {}
	for i, (conv, block) in enumerate(zip(self.convs, self.blocks)):
	x, input_size = conv(x, input_size)
	if self.enable_checkpoint:
	x, input_size = checkpoint.checkpoint(block, x, input_size)
	else:
	x, input_size = block(x, input_size)
	if i in self.out_indices:
	out = x.view(-1, *input_size, self.embed_dims[i]).permute(0, 3, 1, 2).contiguous()
	outs["res{}".format(i + 2)] = out

	if len(self.out_indices) == 0:
	outs["res5"] = x.view(-1, *input_size, self.embed_dims[-1]).permute(0, 3, 1, 2).contiguous()

	return outs

	def forward(self, x):
	x = self.forward_features(x)
	# x = self.head(x)
	return x

	class D2DaViT(DaViT, Backbone):
	def __init__(self, cfg, input_shape):

	spec = cfg['BACKBONE']['DAVIT']

	super().__init__(
	num_classes=0,
	depths=spec['DEPTHS'],
	embed_dims=spec['DIM_EMBED'],
	num_heads=spec['NUM_HEADS'],
	num_groups=spec['NUM_GROUPS'],
	patch_size=spec['PATCH_SIZE'],
	patch_stride=spec['PATCH_STRIDE'],
	patch_padding=spec['PATCH_PADDING'],
	patch_prenorm=spec['PATCH_PRENORM'],
	drop_path_rate=spec['DROP_PATH_RATE'],
	img_size=input_shape,
	window_size=spec.get('WINDOW_SIZE', 7),
	enable_checkpoint=spec.get('ENABLE_CHECKPOINT', False),
	conv_at_attn=spec.get('CONV_AT_ATTN', True),
	conv_at_ffn=spec.get('CONV_AT_FFN', True),
	out_indices=spec.get('OUT_INDICES', []),
	)

	self._out_features = cfg['BACKBONE']['DAVIT']['OUT_FEATURES']

	self._out_feature_strides = {
	"res2": 4,
	"res3": 8,
	"res4": 16,
	"res5": 32,
	}
	self._out_feature_channels = {
	"res2": self.embed_dims[0],
	"res3": self.embed_dims[1],
	"res4": self.embed_dims[2],
	"res5": self.embed_dims[3],
	}

	def forward(self, x):
	"""
	Args:
	x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
	Returns:
	dict[str->Tensor]: names and the corresponding features
	"""
	assert (
	x.dim() == 4
	), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!"
	outputs = {}
	y = super().forward(x)

	for k in y.keys():
	if k in self._out_features:
	outputs[k] = y[k]
	return outputs

	def output_shape(self):
	return {
	name: ShapeSpec(
	channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
	)
	for name in self._out_features
	}

	@property
	def size_divisibility(self):
	return 32

	@register_backbone
	def get_davit_backbone(cfg):
	davit = D2DaViT(cfg['MODEL'], 224)

	if cfg['MODEL']['BACKBONE']['LOAD_PRETRAINED'] is True:
	filename = cfg['MODEL']['BACKBONE']['PRETRAINED']
	logger.info(f'=> init from {filename}')
	davit.from_pretrained(
	filename,
	cfg['MODEL']['BACKBONE']['DAVIT'].get('PRETRAINED_LAYERS', ['*']),
	cfg['VERBOSE'])

	return davit