Collection: Tiny dummy models (randomly initialized tiny models for debugging/testing purposes, 141 items).
This tiny model is intended for debugging. It is randomly initialized, using a configuration adapted from facebook/sam3.
import requests
import torch
from PIL import Image
from transformers import Sam3Model, Sam3Processor

model_id = "yujiepan/sam3-tiny-random"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Sam3Model.from_pretrained(model_id).to(device)
processor = Sam3Processor.from_pretrained(model_id)

kitchen_url = "http://images.cocodataset.org/val2017/000000136466.jpg"
kitchen_image = Image.open(requests.get(kitchen_url, stream=True).raw).convert("RGB")

# Segment "handle" but exclude the oven handle using a negative box
text = "handle"
# Negative box covering the oven handle area (xyxy): [40, 183, 318, 204]
oven_handle_box = [40, 183, 318, 204]
input_boxes = [[oven_handle_box]]

inputs = processor(
    images=kitchen_image,
    text=text,
    input_boxes=input_boxes,
    input_boxes_labels=[[0]],  # 0 = negative (exclude this region)
    return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Post-process the raw outputs into per-image instance segmentations
results = processor.post_process_instance_segmentation(
    outputs,
    threshold=0.5,
    mask_threshold=0.5,
    target_sizes=inputs.get("original_sizes").tolist(),
)[0]
print(results)
# This will segment pot handles but exclude the oven handle
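Since the tiny model is randomly initialized, the segmentation output itself is not meaningful; the point is to exercise the full pipeline. Below is a minimal sketch for inspecting the post-processed result, assuming it behaves like a dict of tensors (the exact keys may differ across transformers versions):

# Sketch: inspect the per-image result without relying on specific keys.
# Assumes `results` is a dict; adapt if your transformers version returns a different structure.
for key, value in results.items():
    if torch.is_tensor(value):
        print(key, tuple(value.shape), value.dtype)
    else:
        print(key, type(value).__name__)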
import json

import torch
from huggingface_hub import hf_hub_download
from transformers import Sam3Processor, set_seed
from transformers.models.sam3.modeling_sam3 import Sam3Config, Sam3Model

source_model_id = "facebook/sam3"
save_folder = "/tmp/yujiepan/sam3-tiny-random"

# Copy the processor from the source model unchanged
processor = Sam3Processor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Start from the original config.json and shrink it in place
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
HIDDEN_SIZE = 16
INTERMEDIATE_SIZE = 32
NUM_ATTENTION_HEADS = 2
config_json['detector_config']['detr_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['detr_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['geometry_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['detector_config']['text_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'projection_dim': HIDDEN_SIZE,
    'num_hidden_layers': 2,
})
config_json['detector_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'fpn_hidden_size': HIDDEN_SIZE,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['detector_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})
config_json['tracker_config']['mask_decoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'iou_head_hidden_dim': HIDDEN_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config'].update({
    'mask_downsampler_embed_dim': HIDDEN_SIZE,
    'memory_attention_feed_forward_hidden_size': HIDDEN_SIZE,
    'memory_attention_hidden_size': HIDDEN_SIZE,
    'memory_encoder_hidden_size': HIDDEN_SIZE,
    'memory_fuser_embed_dim': HIDDEN_SIZE,
    'memory_fuser_intermediate_dim': INTERMEDIATE_SIZE,
})
config_json['tracker_config']['prompt_encoder_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
})
config_json['tracker_config']['vision_config']['backbone_config'].update({
    'hidden_size': HIDDEN_SIZE,
    'intermediate_size': INTERMEDIATE_SIZE,
    'num_attention_heads': NUM_ATTENTION_HEADS,
    'global_attn_indexes': [1, 3, 5, 7],
    'num_hidden_layers': 8,
})
config_json['tracker_config']['vision_config'].update({
    'fpn_hidden_size': HIDDEN_SIZE,
})
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
json.dump(config_json, f, indent=2)
config = Sam3Config.from_pretrained(
save_folder,
trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.float32)
model = Sam3Model(config)
set_seed(42)
model = model.cpu()
with torch.no_grad():
for name, p in sorted(model.named_parameters()):
torch.nn.init.normal_(p, 0, 0.1)
print(name, p.shape)
model.save_pretrained(save_folder)
# print(list(model.state_dict().keys()))
# there is some bug in model.save_pretrained... Re-save the model weights here.
import safetensors.torch
safetensors.torch.save_file(
tensors=model.state_dict(),
filename=f"{save_folder}/model.safetensors"
)
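Because the weights are re-saved manually via safetensors, a quick reload check (a minimal sketch, nothing official) helps confirm the saved checkpoint is consistent and that the model is indeed tiny:

# Sketch: reload the tiny checkpoint and report its size.
reloaded = Sam3Model.from_pretrained(save_folder)
num_params = sum(p.numel() for p in reloaded.parameters())
print(f"reloaded OK: {num_params / 1e6:.2f}M parameters")

The full module structure of the resulting model is printed below.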
Sam3Model(
(vision_encoder): Sam3VisionModel(
(backbone): Sam3ViTModel(
(embeddings): Sam3ViTEmbeddings(
(patch_embeddings): Sam3ViTPatchEmbeddings(
(projection): Conv2d(3, 16, kernel_size=(14, 14), stride=(14, 14), bias=False)
)
(dropout): Dropout(p=0.0, inplace=False)
)
(layer_norm): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
(layers): ModuleList(
(0-7): 8 x Sam3ViTLayer(
(layer_norm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
(rotary_emb): Sam3ViTRotaryEmbedding()
(attention): Sam3ViTRoPEAttention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(layer_norm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
(mlp): Sam3MLP(
(activation_fn): GELUActivation()
(fc1): Linear(in_features=16, out_features=32, bias=True)
(fc2): Linear(in_features=32, out_features=16, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(dropout): Dropout(p=0.0, inplace=False)
)
)
)
(neck): Sam3VisionNeck(
(position_encoding): Sam3SinePositionEmbedding()
(fpn_layers): ModuleList(
(0): Sam3FPNLayer(
(scale_layers): ModuleList(
(0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
(1): GELU(approximate='none')
(2): ConvTranspose2d(8, 4, kernel_size=(2, 2), stride=(2, 2))
)
(proj1): Conv2d(4, 16, kernel_size=(1, 1), stride=(1, 1))
(proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(1): Sam3FPNLayer(
(scale_layers): ModuleList(
(0): ConvTranspose2d(16, 8, kernel_size=(2, 2), stride=(2, 2))
)
(proj1): Conv2d(8, 16, kernel_size=(1, 1), stride=(1, 1))
(proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(2): Sam3FPNLayer(
(scale_layers): ModuleList()
(proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
(proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(3): Sam3FPNLayer(
(scale_layers): ModuleList(
(0): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
)
(proj1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
(proj2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
)
)
)
(text_encoder): CLIPTextModelWithProjection(
(text_model): CLIPTextTransformer(
(embeddings): CLIPTextEmbeddings(
(token_embedding): Embedding(49408, 16)
(position_embedding): Embedding(32, 16)
)
(encoder): CLIPEncoder(
(layers): ModuleList(
(0-1): 2 x CLIPEncoderLayer(
(self_attn): CLIPAttention(
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(out_proj): Linear(in_features=16, out_features=16, bias=True)
)
(layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(mlp): CLIPMLP(
(activation_fn): GELUActivation()
(fc1): Linear(in_features=16, out_features=32, bias=True)
(fc2): Linear(in_features=32, out_features=16, bias=True)
)
(layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
)
)
)
(final_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
)
(text_projection): Linear(in_features=16, out_features=16, bias=False)
)
(text_projection): Linear(in_features=16, out_features=16, bias=True)
(geometry_encoder): Sam3GeometryEncoder(
(position_encoding): Sam3SinePositionEmbedding()
(label_embed): Embedding(2, 16)
(cls_embed): Embedding(1, 16)
(boxes_direct_project): Linear(in_features=4, out_features=16, bias=True)
(boxes_pool_project): Conv2d(16, 16, kernel_size=(7, 7), stride=(1, 1))
(boxes_pos_enc_project): Linear(in_features=18, out_features=16, bias=True)
(vision_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(final_proj): Linear(in_features=16, out_features=16, bias=True)
(prompt_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(layers): ModuleList(
(0-2): 3 x Sam3GeometryEncoderLayer(
(layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(self_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(dropout): Dropout(p=0.1, inplace=False)
(cross_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(mlp): Sam3MLP(
(activation_fn): ReLU()
(fc1): Linear(in_features=16, out_features=32, bias=True)
(fc2): Linear(in_features=32, out_features=16, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
)
)
(output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
)
(detr_encoder): Sam3DetrEncoder(
(layers): ModuleList(
(0-5): 6 x Sam3DetrEncoderLayer(
(layer_norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(self_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(dropout): Dropout(p=0.1, inplace=False)
(cross_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(layer_norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(mlp): Sam3MLP(
(activation_fn): ReLU()
(fc1): Linear(in_features=16, out_features=32, bias=True)
(fc2): Linear(in_features=32, out_features=16, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layer_norm3): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
)
)
)
(detr_decoder): Sam3DetrDecoder(
(layers): ModuleList(
(0-5): 6 x Sam3DetrDecoderLayer(
(self_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(self_attn_dropout): Dropout(p=0.1, inplace=False)
(self_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(text_cross_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(text_cross_attn_dropout): Dropout(p=0.1, inplace=False)
(text_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(vision_cross_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(vision_cross_attn_dropout): Dropout(p=0.1, inplace=False)
(vision_cross_attn_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(mlp): Sam3MLP(
(activation_fn): ReLU()
(fc1): Linear(in_features=16, out_features=32, bias=True)
(fc2): Linear(in_features=32, out_features=16, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(mlp_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(mlp_dropout): Dropout(p=0.1, inplace=False)
)
)
(output_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(box_head): Sam3DecoderMLP(
(layer1): Linear(in_features=16, out_features=16, bias=True)
(layer2): Linear(in_features=16, out_features=16, bias=True)
(layer3): Linear(in_features=16, out_features=4, bias=True)
)
(query_embed): Embedding(200, 16)
(reference_points): Embedding(200, 4)
(presence_token): Embedding(1, 16)
(presence_head): Sam3DecoderMLP(
(layer1): Linear(in_features=16, out_features=16, bias=True)
(layer2): Linear(in_features=16, out_features=16, bias=True)
(layer3): Linear(in_features=16, out_features=1, bias=True)
)
(presence_layer_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(ref_point_head): Sam3DecoderMLP(
(layer1): Linear(in_features=32, out_features=16, bias=True)
(layer2): Linear(in_features=16, out_features=16, bias=True)
)
(box_rpb_embed_x): Sam3DecoderMLP(
(layer1): Linear(in_features=2, out_features=16, bias=True)
(layer2): Linear(in_features=16, out_features=2, bias=True)
)
(box_rpb_embed_y): Sam3DecoderMLP(
(layer1): Linear(in_features=2, out_features=16, bias=True)
(layer2): Linear(in_features=16, out_features=2, bias=True)
)
(position_encoding): Sam3SinePositionEmbedding()
)
(mask_decoder): Sam3MaskDecoder(
(pixel_decoder): Sam3PixelDecoder(
(conv_layers): ModuleList(
(0-2): 3 x Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
(norms): ModuleList(
(0-2): 3 x GroupNorm(8, 16, eps=1e-05, affine=True)
)
)
(mask_embedder): Sam3MaskEmbedder(
(layers): ModuleList(
(0-2): 3 x Linear(in_features=16, out_features=16, bias=True)
)
(activation): ReLU()
)
(instance_projection): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1))
(semantic_projection): Conv2d(16, 1, kernel_size=(1, 1), stride=(1, 1))
(prompt_cross_attn): Sam3Attention(
(q_proj): Linear(in_features=16, out_features=16, bias=True)
(k_proj): Linear(in_features=16, out_features=16, bias=True)
(v_proj): Linear(in_features=16, out_features=16, bias=True)
(o_proj): Linear(in_features=16, out_features=16, bias=True)
)
(prompt_cross_attn_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(prompt_cross_attn_dropout): Dropout(p=0.0, inplace=False)
)
(dot_product_scoring): Sam3DotProductScoring(
(text_mlp): Sam3DecoderMLP(
(layer1): Linear(in_features=16, out_features=32, bias=True)
(layer2): Linear(in_features=32, out_features=16, bias=True)
)
(text_mlp_dropout): Dropout(p=0.1, inplace=False)
(text_mlp_out_norm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
(text_proj): Linear(in_features=16, out_features=16, bias=True)
(query_proj): Linear(in_features=16, out_features=16, bias=True)
)
)
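To double-check that every component above was actually shrunk, the per-submodule parameter counts can be printed from the `model` object built in the creation script (a minimal sketch using standard torch module APIs; the child names follow the printed structure above):

# Sketch: per-submodule parameter counts for the tiny model.
# Run after building `model` with the creation script above.
for name, module in model.named_children():
    num = sum(p.numel() for p in module.parameters())
    print(f"{name}: {num} parameters")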
Base model: facebook/sam3