xuewu.lin committed
Commit 7d023fa · 1 Parent(s): d5157a8
Files changed (3)
  1. README.md +2 -0
  2. model.config.json +316 -0
  3. training_log.txt +0 -0
README.md CHANGED
@@ -1,3 +1,5 @@
 ---
 license: mit
 ---
+
+The BIP3D detection model is based on Grounding-DINO Tiny and was trained on EmbodiedScan V1.
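
For reference, the three files touched by this commit can be pulled locally with huggingface_hub's snapshot_download; the repo_id below is a placeholder, since the commit page does not name the repository:

    from huggingface_hub import snapshot_download

    # Placeholder repo_id -- substitute the actual model repository.
    local_dir = snapshot_download(repo_id="<user>/<bip3d-model-repo>")
    # local_dir now contains README.md, model.config.json, and training_log.txt.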
model.config.json ADDED
@@ -0,0 +1,316 @@
{
  "__config_type__": "robo_orchard_lab.models.bip3d.structure:BIP3DConfig",
  "class_type": "robo_orchard_lab.models.bip3d.structure:BIP3D",
  "backbone": {
    "type": "robo_orchard_lab.models.modules.swin_transformer:SwinTransformer",
    "embed_dims": 96,
    "depths": [2, 2, 6, 2],
    "num_heads": [3, 6, 12, 24],
    "window_size": 7,
    "mlp_ratio": 4,
    "qkv_bias": true,
    "qk_scale": null,
    "drop_rate": 0.0,
    "attn_drop_rate": 0.0,
    "out_indices": [1, 2, 3],
    "with_cp": true,
    "convert_weights": false
  },
  "decoder": {
    "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:BBox3DDecoder",
    "look_forward_twice": true,
    "instance_bank": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.instance_bank:InstanceBank",
      "num_anchor": 50,
      "anchor": "./anchor_files/embodiedscan_kmeans_det_cam_log_z-0.2-3.npy",
      "embed_dims": 256,
      "anchor_in_camera": true
    },
    "anchor_encoder": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxEncoder",
      "embed_dims": 256,
      "rot_dims": 3
    },
    "graph_model": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
      "embed_dims": 256,
      "num_heads": 8,
      "batch_first": true
    },
    "ffn": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:FFN",
      "embed_dims": 256,
      "feedforward_channels": 2048,
      "ffn_drop": 0.0
    },
    "norm_layer": {
      "type": "torch.nn.modules.normalization:LayerNorm",
      "normalized_shape": 256
    },
    "deformable_model": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.deformable_aggregation:DeformableFeatureAggregation",
      "embed_dims": 256,
      "num_groups": 8,
      "num_levels": 4,
      "use_camera_embed": true,
      "with_depth": true,
      "min_depth": 0.25,
      "max_depth": 10,
      "kps_generator": {
        "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:SparseBox3DKeyPointsGenerator",
        "fix_scale": [
          [0, 0, 0],
          [0.45, 0, 0],
          [-0.45, 0, 0],
          [0, 0.45, 0],
          [0, -0.45, 0],
          [0, 0, 0.45],
          [0, 0, -0.45]
        ],
        "num_learnable_pts": 9
      },
      "with_value_proj": true,
      "filter_outlier": true
    },
    "text_cross_attn": {
      "type": "robo_orchard_lab.models.layers.transformer_layers:MultiheadAttention",
      "embed_dims": 256,
      "num_heads": 8,
      "batch_first": true
    },
    "refine_layer": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingRefineClsHead",
      "embed_dims": 256,
      "output_dim": 9,
      "cls_bias": true
    },
    "loss_cls": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:FocalLoss",
      "use_sigmoid": true,
      "gamma": 2.0,
      "alpha": 0.25,
      "loss_weight": 1.0
    },
    "loss_reg": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:DoF9BoxLoss",
      "loss_weight_wd": 1.0,
      "loss_weight_cd": 0.8
    },
    "sampler": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.target:Grounding3DTarget",
      "cls_weight": 1.0,
      "box_weight": 1.0,
      "num_dn": 100,
      "cost_weight_wd": 1.0,
      "cost_weight_cd": 0.8,
      "with_dn_query": true,
      "num_classes": 284,
      "embed_dims": 256
    },
    "gt_reg_key": "gt_bboxes_3d",
    "gt_cls_key": "tokens_positive",
    "post_processor": {
      "type": "robo_orchard_lab.models.bip3d.grounding_decoder.bbox3d_decoder:GroundingBox3DPostProcess",
      "num_output": 1000
    }
  },
  "neck": {
    "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
    "in_channels": [192, 384, 768],
    "kernel_size": 1,
    "out_channels": 256,
    "act_cfg": null,
    "bias": true,
    "norm_cfg": {
      "type": "torch.nn.modules.normalization:GroupNorm",
      "num_groups": 32
    },
    "num_outs": 4
  },
  "text_encoder": {
    "type": "robo_orchard_lab.models.bip3d.bert:BertModel",
    "special_tokens_list": ["[CLS]", "[SEP]"],
    "name": "./ckpt/bert-base-uncased",
    "pad_to_max": false,
    "use_sub_sentence_represent": true,
    "add_pooling_layer": false,
    "max_tokens": 768,
    "use_checkpoint": true,
    "return_tokenized": true
  },
  "feature_enhancer": {
    "type": "robo_orchard_lab.models.bip3d.feature_enhancer:TextImageDeformable2DEnhancer",
    "embed_dims": 256,
    "num_layers": 6,
    "text_img_attn_block": {
      "v_dim": 256,
      "l_dim": 256,
      "embed_dim": 1024,
      "num_heads": 4,
      "init_values": 0.0001
    },
    "img_attn_block": {
      "self_attn_cfg": {
        "embed_dims": 256,
        "num_levels": 4,
        "im2col_step": 1
      },
      "ffn_cfg": {
        "embed_dims": 256,
        "feedforward_channels": 2048,
        "ffn_drop": 0.0
      }
    },
    "text_attn_block": {
      "self_attn_cfg": {
        "num_heads": 4,
        "embed_dims": 256
      },
      "ffn_cfg": {
        "embed_dims": 256,
        "feedforward_channels": 1024,
        "ffn_drop": 0.0
      }
    },
    "num_feature_levels": 4,
    "positional_encoding": {
      "num_feats": 128,
      "normalize": true,
      "offset": 0.0,
      "temperature": 20
    }
  },
  "spatial_enhancer": {
    "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:DepthFusionSpatialEnhancer",
    "embed_dims": 256,
    "feature_3d_dim": 32,
    "num_depth_layers": 2,
    "min_depth": 0.25,
    "max_depth": 10,
    "num_depth": 64,
    "with_feature_3d": true,
    "loss_depth_weight": 1.0
  },
  "data_preprocessor": {
    "type": "robo_orchard_lab.models.layers.data_preprocessors:BaseDataPreprocessor",
    "mean": [123.675, 116.28, 103.53],
    "std": [58.395, 57.12, 57.375],
    "channel_flip": true,
    "batch_transforms": [
      {
        "type": "robo_orchard_lab.models.bip3d.spatial_enhancer:BatchDepthProbGTGenerator",
        "min_depth": 0.25,
        "max_depth": 10,
        "num_depth": 64,
        "origin_stride": 4,
        "valid_threshold": 0.0,
        "stride": [8, 16, 32, 64]
      },
      {
        "type": "robo_orchard_lab.models.layers.data_preprocessors:GridMask",
        "apply_grid_mask_keys": ["imgs", "depths"]
      }
    ]
  },
  "backbone_3d": {
    "type": "robo_orchard_lab.models.modules.resnet:ResNet",
    "depth": 34,
    "in_channels": 1,
    "base_channels": 4,
    "num_stages": 4,
    "out_indices": [1, 2, 3],
    "bn_eval": true,
    "with_cp": true,
    "style": "pytorch"
  },
  "neck_3d": {
    "type": "robo_orchard_lab.models.modules.channel_mapper:ChannelMapper",
    "in_channels": [8, 16, 32],
    "kernel_size": 1,
    "out_channels": 32,
    "act_cfg": null,
    "bias": true,
    "norm_cfg": {
      "type": "torch.nn.modules.normalization:GroupNorm",
      "num_groups": 4
    },
    "num_outs": 4
  },
  "input_2d": "imgs",
  "input_3d": "depths",
  "embed_dims": 256,
  "pre_spatial_enhancer": false
}
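
A note on the config's reference convention: every component is specified by a "package.module:ClassName" string rather than a registry alias. The sketch below is a minimal, standard-library illustration of how such references resolve to classes; it is an assumption for illustration, not the actual robo_orchard_lab loader, which presumably builds the model through the BIP3DConfig machinery named in "__config_type__":

    import importlib
    import json

    def resolve(path):
        # Split "package.module:ClassName" (the convention used throughout
        # model.config.json) into a module path and an attribute name,
        # import the module, and return the class.
        module_name, _, attr = path.partition(":")
        return getattr(importlib.import_module(module_name), attr)

    with open("model.config.json") as f:
        cfg = json.load(f)

    model_cls = resolve(cfg["class_type"])            # BIP3D
    backbone_cls = resolve(cfg["backbone"]["type"])   # SwinTransformer

The backbone hyperparameters above (embed_dims 96, depths [2, 2, 6, 2], num_heads [3, 6, 12, 24], window_size 7) are the standard Swin-Tiny settings, consistent with the Grounding-DINO Tiny base noted in the README.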
training_log.txt ADDED
The diff for this file is too large to render. See raw diff