import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
class MaskExtractor(nn.Module):
    def __init__(self, config, mm_hidden_size, depth=2):
        super().__init__()
        self.mask_pooling = MaskPooling()
        # MLP that projects pooled region features into the LM hidden size
        modules = [nn.Linear(mm_hidden_size, config.hidden_size)]
        for _ in range(1, depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(config.hidden_size, config.hidden_size))
        self.feat_linear = nn.Sequential(*modules)

    def forward(self, feats, masks):
        query_feats = []
        if masks is None:  # inference without region prompts
            return None
        num_imgs = len(masks)
        image_idx = 0
        for idx in range(num_imgs):
            if masks[idx] is None:
                continue
            for mask_idx in range(len(masks[idx])):
                mask = masks[idx][mask_idx].unsqueeze(0).unsqueeze(0).float()  # [1, 1, h, w]
                if len(mask[0]) == 0:
                    # degenerate mask: fall back to an all-zero 336x336 mask
                    print('mask error')
                    mask = torch.zeros((1, 1, 336, 336), device=feats.device, dtype=torch.float)
                feat = feats[image_idx].unsqueeze(0)  # [1, h, w, c]
                image_idx += 1
                feat = feat.permute(0, 3, 1, 2)  # [1, c, h, w]
                feat = feat.to(mask.dtype)
                mask_feat_raw = self.mask_pooling(feat, mask)  # [n, mm_hidden_size]
                query_feats.append(mask_feat_raw)
        if len(query_feats) == 0:
            return None
        mask_feats = torch.cat(query_feats, dim=0)
        mask_feats = mask_feats.to(feats[0].dtype)
        mask_feats_linear = self.feat_linear(mask_feats)
        return mask_feats_linear
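
# Note on the input contract (inferred from the loop above, not stated in the
# original): `feats` is indexed once per *mask*, since `image_idx` advances on
# every mask, so callers are expected to pass one feature map per region mask
# across all images, not one per image.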
def kmeans_fast(tokens, num_clusters=10, num_iterations=20):
    # tokens: input token features, shape [n, d]
    # num_clusters: number of centroids the tokens are compressed into
    # num_iterations: maximum number of K-means iterations
    # initialize centroids from a random subset of tokens
    n, d = tokens.shape
    centroids = tokens[torch.randperm(n)[:num_clusters]]
    for _ in range(num_iterations):
        # broadcast tokens against centroids to compute all pairwise
        # distances at once, avoiding an explicit Python loop
        tokens_expand = tokens.unsqueeze(1)        # [n, 1, d]
        centroids_expand = centroids.unsqueeze(0)  # [1, num_clusters, d]
        # squared Euclidean distance from every token to every centroid
        distances = torch.sum((tokens_expand - centroids_expand) ** 2, dim=2)  # [n, num_clusters]
        # assign each token to its nearest centroid
        labels = torch.argmin(distances, dim=1)  # [n]
        # recompute centroids; an empty cluster keeps its previous centroid
        new_centroids = torch.stack([
            tokens[labels == i].mean(dim=0) if tokens[labels == i].size(0) > 0 else centroids[i]
            for i in range(num_clusters)
        ])
        # stop early once the centroids have converged
        if torch.allclose(centroids, new_centroids, atol=1e-6):
            break
        centroids = new_centroids
    return centroids
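
# Example (sketch, synthetic data): compress 50 region tokens of width 1024
# down to the 10 centroids MaskPooling expects.
#
#   tokens = torch.randn(50, 1024)
#   centroids = kmeans_fast(tokens, num_clusters=10, num_iterations=20)
#   assert centroids.shape == (10, 1024)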
class MaskPooling(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, mask):
        # x: feature map [b, c, h, w]; mask: binary region mask [b, q, H, W]
        if not x.shape[-2:] == mask.shape[-2:]:
            # upsample the features to the mask's spatial resolution
            x = F.interpolate(x, size=mask.shape[-2:], mode='bilinear', align_corners=False)
        if not x.device == mask.device:
            mask = mask.to(x.device)
        mask = (mask > 0).to(mask.dtype)
        mask = mask.permute(1, 0, 2, 3)  # [q, b, H, W]
        # zero out features outside the region
        mask_emb = x * mask
        # keep only spatial positions carrying a non-zero feature
        keep = torch.any(mask_emb != 0, dim=(0, 1))  # [H, W] boolean
        mask_emb = mask_emb[:, :, keep]
        mask_embedding = mask_emb[0].permute(1, 0)  # [num_positions, c]
        # FIXME: hard-coded cap of 10 region tokens, enforced by k-means
        if len(mask_embedding) > 10:
            mask_embedding = kmeans_fast(mask_embedding)
        return mask_embedding
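
# Sketch of MaskPooling in isolation (synthetic shapes): the features are
# upsampled to the mask resolution, in-region positions are gathered, and
# anything above 10 positions is compressed by kmeans_fast.
#
#   pool = MaskPooling()
#   x = torch.randn(1, 1024, 24, 24)   # [b, c, h, w] feature map
#   m = torch.zeros(1, 1, 336, 336)    # [b, q, H, W] binary mask
#   m[..., 50:120, 50:120] = 1
#   tokens = pool(x, m)                # [<=10, 1024]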
def build_region_encoder(config, mm_hidden_size):
    return MaskExtractor(config, mm_hidden_size)
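
if __name__ == '__main__':
    # Minimal smoke test (not part of the original module). The config is a
    # stand-in: any object exposing `hidden_size` works; 1024/4096 mirror
    # typical CLIP-L / LLaMA widths but are assumptions, not requirements.
    from types import SimpleNamespace

    config = SimpleNamespace(hidden_size=4096)
    encoder = build_region_encoder(config, mm_hidden_size=1024)

    feats = torch.randn(2, 24, 24, 1024)  # one [h, w, c] feature map per mask
    masks = [torch.zeros(2, 336, 336)]    # two region masks for one image
    masks[0][0, 50:150, 50:150] = 1
    masks[0][1, 200:300, 120:220] = 1

    region_tokens = encoder(feats, masks)
    print(region_tokens.shape)  # [num_region_tokens, 4096]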