update

Files changed (3) hide show

README.md +2 -0
model.config.json +316 -0
training_log.txt +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,5 @@
 ---
 license: mit
 ---

 ---
 license: mit
 ---
+The BIP3D detection model is based on Grounding-DINO Tiny and was trained on EmbodiedScan V1.

model.config.json ADDED Viewed

@@ -0,0 +1,316 @@

+{
+    "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
+    "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
+    "backbone": {
+        "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
+        "embed_dims": 96,
+        "depths": [
+            2,
+            2,
+            6,
+            2
+        ],
+        "num_heads": [
+            3,
+            6,
+            12,
+            24
+        ],
+        "window_size": 7,
+        "mlp_ratio": 4,
+        "qkv_bias": true,
+        "qk_scale": null,
+        "drop_rate": 0.0,
+        "attn_drop_rate": 0.0,
+        "out_indices": [
+            1,
+            2,
+            3
+        ],
+        "with_cp": true,
+        "convert_weights": false
+    },
+    "decoder": {
+        "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:BBox3DDecoder",
+        "look_forward_twice": true,
+        "instance_bank": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.instance_bank:InstanceBank",
+            "num_anchor": 50,
+            "anchor": "./anchor_files/embodiedscan_kmeans_det_cam_log_z-0.2-3.npy",
+            "embed_dims": 256,
+            "anchor_in_camera": true
+        },
+        "anchor_encoder": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxEncoder",
+            "embed_dims": 256,
+            "rot_dims": 3
+        },
+        "graph_model": {
+            "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
+            "embed_dims": 256,
+            "num_heads": 8,
+            "batch_first": true
+        },
+        "ffn": {
+            "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
+            "embed_dims": 256,
+            "feedforward_channels": 2048,
+            "ffn_drop": 0.0
+        },
+        "norm_layer": {
+            "type": "torch.nn.modules.normalization:LayerNorm",
+            "normalized_shape": 256
+        },
+        "deformable_model": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.deformable_aggregation:DeformableFeatureAggregation",
+            "embed_dims": 256,
+            "num_groups": 8,
+            "num_levels": 4,
+            "use_camera_embed": true,
+            "with_depth": true,
+            "min_depth": 0.25,
+            "max_depth": 10,
+            "kps_generator": {
+                "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:SparseBox3DKeyPointsGenerator",
+                "fix_scale": [
+                    [
+                        0,
+                        0,
+                        0
+                    ],
+                    [
+                        0.45,
+                        0,
+                        0
+                    ],
+                    [
+                        -0.45,
+                        0,
+                        0
+                    ],
+                    [
+                        0,
+                        0.45,
+                        0
+                    ],
+                    [
+                        0,
+                        -0.45,
+                        0
+                    ],
+                    [
+                        0,
+                        0,
+                        0.45
+                    ],
+                    [
+                        0,
+                        0,
+                        -0.45
+                    ]
+                ],
+                "num_learnable_pts": 9
+            },
+            "with_value_proj": true,
+            "filter_outlier": true
+        },
+        "text_cross_attn": {
+            "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
+            "embed_dims": 256,
+            "num_heads": 8,
+            "batch_first": true
+        },
+        "refine_layer": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingRefineClsHead",
+            "embed_dims": 256,
+            "output_dim": 9,
+            "cls_bias": true
+        },
+        "loss_cls": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:FocalLoss",
+            "use_sigmoid": true,
+            "gamma": 2.0,
+            "alpha": 0.25,
+            "loss_weight": 1.0
+        },
+        "loss_reg": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxLoss",
+            "loss_weight_wd": 1.0,
+            "loss_weight_cd": 0.8
+        },
+        "sampler": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.target:Grounding3DTarget",
+            "cls_weight": 1.0,
+            "box_weight": 1.0,
+            "num_dn": 100,
+            "cost_weight_wd": 1.0,
+            "cost_weight_cd": 0.8,
+            "with_dn_query": true,
+            "num_classes": 284,
+            "embed_dims": 256
+        },
+        "gt_reg_key": "gt_bboxes_3d",
+        "gt_cls_key": "tokens_positive",
+        "post_processor": {
+            "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingBox3DPostProcess",
+            "num_output": 1000
+        }
+    },
+    "neck": {
+        "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
+        "in_channels": [
+            192,
+            384,
+            768
+        ],
+        "kernel_size": 1,
+        "out_channels": 256,
+        "act_cfg": null,
+        "bias": true,
+        "norm_cfg": {
+            "type": "torch.nn.modules.normalization:GroupNorm",
+            "num_groups": 32
+        },
+        "num_outs": 4
+    },
+    "text_encoder": {
+        "type": "robo_orchard_lab.models.bip3d.bert:BertModel",
+        "special_tokens_list": [
+            "[CLS]",
+            "[SEP]"
+        ],
+        "name": "./ckpt/bert-base-uncased",
+        "pad_to_max": false,
+        "use_sub_sentence_represent": true,
+        "add_pooling_layer": false,
+        "max_tokens": 768,
+        "use_checkpoint": true,
+        "return_tokenized": true
+    },
+    "feature_enhancer": {
+        "type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
+        "embed_dims": 256,
+        "num_layers": 6,
+        "text_img_attn_block": {
+            "v_dim": 256,
+            "l_dim": 256,
+            "embed_dim": 1024,
+            "num_heads": 4,
+            "init_values": 0.0001
+        },
+        "img_attn_block": {
+            "self_attn_cfg": {
+                "embed_dims": 256,
+                "num_levels": 4,
+                "im2col_step": 1
+            },
+            "ffn_cfg": {
+                "embed_dims": 256,
+                "feedforward_channels": 2048,
+                "ffn_drop": 0.0
+            }
+        },
+        "text_attn_block": {
+            "self_attn_cfg": {
+                "num_heads": 4,
+                "embed_dims": 256
+            },
+            "ffn_cfg": {
+                "embed_dims": 256,
+                "feedforward_channels": 1024,
+                "ffn_drop": 0.0
+            }
+        },
+        "num_feature_levels": 4,
+        "positional_encoding": {
+            "num_feats": 128,
+            "normalize": true,
+            "offset": 0.0,
+            "temperature": 20
+        }
+    },
+    "spatial_enhancer": {
+        "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
+        "embed_dims": 256,
+        "feature_3d_dim": 32,
+        "num_depth_layers": 2,
+        "min_depth": 0.25,
+        "max_depth": 10,
+        "num_depth": 64,
+        "with_feature_3d": true,
+        "loss_depth_weight": 1.0
+    },
+    "data_preprocessor": {
+        "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
+        "mean": [
+            123.675,
+            116.28,
+            103.53
+        ],
+        "std": [
+            58.395,
+            57.12,
+            57.375
+        ],
+        "channel_flip": true,
+        "batch_transforms": [
+            {
+                "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
+                "min_depth": 0.25,
+                "max_depth": 10,
+                "num_depth": 64,
+                "origin_stride": 4,
+                "valid_threshold": 0.0,
+                "stride": [
+                    8,
+                    16,
+                    32,
+                    64
+                ]
+            },
+            {
+                "type": "robo_orchard_lab.models.layers.data_preprocessors:GridMask",
+                "apply_grid_mask_keys": [
+                    "imgs",
+                    "depths"
+                ]
+            }
+        ]
+    },
+    "backbone_3d": {
+        "type": "robo_orchard_lab.models.modules.resnet:ResNet",
+        "depth": 34,
+        "in_channels": 1,
+        "base_channels": 4,
+        "num_stages": 4,
+        "out_indices": [
+            1,
+            2,
+            3
+        ],
+        "bn_eval": true,
+        "with_cp": true,
+        "style": "pytorch"
+    },
+    "neck_3d": {
+        "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
+        "in_channels": [
+            8,
+            16,
+            32
+        ],
+        "kernel_size": 1,
+        "out_channels": 32,
+        "act_cfg": null,
+        "bias": true,
+        "norm_cfg": {
+            "type": "torch.nn.modules.normalization:GroupNorm",
+            "num_groups": 4
+        },
+        "num_outs": 4
+    },
+    "input_2d": "imgs",
+    "input_3d": "depths",
+    "embed_dims": 256,
+    "pre_spatial_enhancer": false
+}

training_log.txt ADDED Viewed

The diff for this file is too large to render. See raw diff