Add necessary files
This view is limited to 50 files because it contains too many changes.
- conf/maplocnet.yaml +105 -0
- conf/maplocnetsingle-101.yaml +105 -0
- conf/maplocnetsingle.yaml +105 -0
- conf/maplocnetsingle0526.yaml +105 -0
- conf/maplocnetsingleunet.yaml +105 -0
- conf/maplocnetsinglhub_DDRNet.yaml +107 -0
- conf/maplocnetsinglhub_FPN-resnet18WeightedEmbedding.yaml +112 -0
- conf/maplocnetsinglhub_FPN-resnet34LightWeightedEmbedding.yaml +112 -0
- conf/maplocnetsinglhub_FPN-resnet34WeightedEmbedding.yaml +112 -0
- conf/maplocnetsinglhub_FPN-resnet50.yaml +111 -0
- conf/maplocnetsinglhub_FPN-resnet50WeightedEmbedding.yaml +112 -0
- conf/maplocnetsinglhub_FPN.yaml +107 -0
- conf/maplocnetsinglhub_FPN_Mobileone.yaml +107 -0
- conf/maplocnetsinglhub_PSP.yaml +107 -0
- conf/orienternet.yaml +103 -0
- dataset/UAV/dataset.py +116 -0
- dataset/__init__.py +4 -0
- dataset/dataset.py +109 -0
- dataset/image.py +140 -0
- dataset/torch.py +111 -0
- evaluation/kitti.py +89 -0
- evaluation/mapillary.py +0 -0
- evaluation/run.py +252 -0
- evaluation/utils.py +40 -0
- evaluation/viz.py +178 -0
- feature_extractor_models/__init__.py +82 -0
- feature_extractor_models/__version__.py +3 -0
- feature_extractor_models/base/__init__.py +13 -0
- feature_extractor_models/base/heads.py +34 -0
- feature_extractor_models/base/hub_mixin.py +154 -0
- feature_extractor_models/base/initialization.py +26 -0
- feature_extractor_models/base/model.py +71 -0
- feature_extractor_models/base/modules.py +131 -0
- feature_extractor_models/decoders/__init__.py +0 -0
- feature_extractor_models/decoders/deeplabv3/__init__.py +3 -0
- feature_extractor_models/decoders/deeplabv3/decoder.py +220 -0
- feature_extractor_models/decoders/deeplabv3/model.py +178 -0
- feature_extractor_models/decoders/fpn/__init__.py +3 -0
- feature_extractor_models/decoders/fpn/decoder.py +133 -0
- feature_extractor_models/decoders/fpn/model.py +107 -0
- feature_extractor_models/decoders/lightfpn/__init__.py +3 -0
- feature_extractor_models/decoders/lightfpn/decoder.py +144 -0
- feature_extractor_models/decoders/lightfpn/model.py +107 -0
- feature_extractor_models/decoders/linknet/__init__.py +3 -0
- feature_extractor_models/decoders/linknet/decoder.py +82 -0
- feature_extractor_models/decoders/linknet/model.py +98 -0
- feature_extractor_models/decoders/manet/__init__.py +3 -0
- feature_extractor_models/decoders/manet/decoder.py +187 -0
- feature_extractor_models/decoders/manet/model.py +102 -0
- feature_extractor_models/decoders/pan/__init__.py +3 -0
conf/maplocnet.yaml
ADDED
@@ -0,0 +1,105 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: true
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v2
    backbone:
      encoder: resnet50
      pretrained: true
      output_dim: 8
      num_downsample: null
      remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 16
    output_dim: 8
    num_classes:
      areas: 7
      ways: 10
      nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_0526_re
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "loss/total/val"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
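All of the conf/*.yaml files in this commit use the same OmegaConf interpolation syntax: ${..train.batch_size} and ${.batch_size} are relative references, while ${data.image_size} and ${experiment.gpus} resolve from the config root. As a minimal sketch (assuming the files are loaded directly with OmegaConf; the training entry point is not part of this diff), the snippet below shows how those references resolve for conf/maplocnet.yaml:

# Minimal sketch: how the ${...} references in conf/maplocnet.yaml resolve when
# the file is loaded with OmegaConf (the loading code itself is not in this commit).
from omegaconf import OmegaConf

cfg = OmegaConf.load("conf/maplocnet.yaml")

# Relative interpolation: data.val.batch_size -> ${..train.batch_size} -> 12,
# data.val.num_workers -> ${.batch_size} -> 12.
print(cfg.data.val.batch_size, cfg.data.val.num_workers)

# Absolute interpolation: model.image_size -> ${data.image_size} -> 256,
# training.trainer.devices -> ${experiment.gpus} -> 2.
print(cfg.model.image_size, cfg.training.trainer.devices)

Because interpolations are resolved on access, changing experiment.gpus in any of these files automatically propagates to training.trainer.devices.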
conf/maplocnetsingle-101.yaml
ADDED
@@ -0,0 +1,105 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v2
    backbone:
      encoder: resnet101
      pretrained: true
      output_dim: 8
      num_downsample: null
      remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      areas: 7
      ways: 10
      nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_523_single_A100_no_mutil_scale_augmentation_resnet101_nosingle
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsingle.yaml
ADDED
@@ -0,0 +1,105 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v2
    backbone:
      encoder: resnet101
      pretrained: true
      output_dim: 8
      num_downsample: null
      remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_523_single_A100_no_mutil_scale_augmentation_resnet101_2
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsingle0526.yaml
ADDED
@@ -0,0 +1,105 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: false
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v2
    backbone:
      encoder: resnet50
      pretrained: true
      output_dim: 8
      num_downsample: null
      remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_523_single_A100_no_mutil_scale
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsingleunet.yaml
ADDED
@@ -0,0 +1,105 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v3
    backbone:
      # encoder: resnet101
      # pretrained: true
      output_dim: 8
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_601_unet
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_DDRNet.yaml
ADDED
@@ -0,0 +1,107 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v5
    architecture: DDRNet23s
    backbone:
      # encoder: resnet50
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_DDRnet
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN-resnet18WeightedEmbedding.yaml
ADDED
@@ -0,0 +1,112 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: false
    flip: false
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: resnet18
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    weighted_embedding: ImprovedAttentionEmbedding
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_norelu_resnet18_ImprovedAttentionEmbedding
  gpus: 5
  seed: 42
training:
  lr: 0.0001
  lr_scheduler:
    name: StepLR
    args:
      step_size: 10
      gamma: 0.1
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN-resnet34LightWeightedEmbedding.yaml
ADDED
@@ -0,0 +1,112 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: false
    flip: false
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: LightFPN
    backbone:
      encoder: resnet34
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    weighted_embedding: ImprovedAttentionEmbedding
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_norelu_resnet34Light_ImprovedAttentionEmbedding
  gpus: 5
  seed: 42
training:
  lr: 0.0001
  lr_scheduler:
    name: StepLR
    args:
      step_size: 10
      gamma: 0.1
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 30
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN-resnet34WeightedEmbedding.yaml
ADDED
@@ -0,0 +1,112 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: false
    flip: false
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: resnet34
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    weighted_embedding: ImprovedAttentionEmbedding
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_norelu_resnet34_ImprovedAttentionEmbedding
  gpus: 5
  seed: 42
training:
  lr: 0.0001
  lr_scheduler:
    name: StepLR
    args:
      step_size: 10
      gamma: 0.1
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN-resnet50.yaml
ADDED
@@ -0,0 +1,111 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: false
    flip: false
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: resnet50
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_norelu_resnet50_temp
  gpus: 2
  seed: 42
training:
  lr: 0.0001
  lr_scheduler:
    name: StepLR
    args:
      step_size: 10
      gamma: 0.1
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN-resnet50WeightedEmbedding.yaml
ADDED
@@ -0,0 +1,112 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: false
    flip: false
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: resnet50
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    weighted_embedding: ImprovedAttentionEmbedding
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_norelu_resnet50_ImprovedAttentionEmbedding
  gpus: 3
  seed: 42
training:
  lr: 0.0001
  lr_scheduler:
    name: StepLR
    args:
      step_size: 10
      gamma: 0.1
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN.yaml
ADDED
@@ -0,0 +1,107 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: resnet101
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_FPN_Resnet50_norelu
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_FPN_Mobileone.yaml
ADDED
@@ -0,0 +1,107 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: FPN
    backbone:
      encoder: mobileone_s3
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocnetsinglhub_FPN_mobileone_s3
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 10
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/maplocnetsinglhub_PSP.yaml
ADDED
@@ -0,0 +1,107 @@
data:
  root: '/root/autodl-fs/DATASET/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  test_citys:
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    flip: true
    image:
      apply: True
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v4
    architecture: PSP
    backbone:
      encoder: resnet50
      # pretrained: true
      output_dim: 8
      # upsampling: 2
      # num_downsample: null
      # remove_stride_from_first_conv: false
  name: maplocnet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 48
    output_dim: 8
    num_classes:
      all: 50
      # ways: 10
      # nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: maplocanet_602_hub_PSP
  gpus: 2
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 300000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "val/xy_recall_1m"
    save_top_k: 5
    mode: max

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
conf/orienternet.yaml
ADDED
@@ -0,0 +1,103 @@
data:
  root: '/home/ubuntu/media/MapLocNetDataset/UAV/'
  train_citys:
    - Paris
    - Berlin
    - London
    - Tokyo
    - NewYork
  val_citys:
    # - Taipei
    # - LosAngeles
    # - Singapore
    - SanFrancisco
  image_size: 256
  train:
    batch_size: 12
    num_workers: 4
  val:
    batch_size: ${..train.batch_size}
    num_workers: ${.batch_size}
  num_classes:
    areas: 7
    ways: 10
    nodes: 33
  pixel_per_meter: 1
  crop_size_meters: 64
  max_init_error: 48
  add_map_mask: true
  resize_image: 512
  pad_to_square: true
  rectify_pitch: true
  augmentation:
    rot90: true
    # flip: true
    image:
      apply: true
      brightness: 0.5
      contrast: 0.4
      saturation: 0.4
      hue: 0.5/3.14
model:
  image_size: ${data.image_size}
  latent_dim: 128
  val_citys: ${data.val_citys}
  image_encoder:
    name: feature_extractor_v2
    backbone:
      encoder: resnet101
      pretrained: true
      output_dim: 8
      num_downsample: null
      remove_stride_from_first_conv: false
  name: orienternet
  matching_dim: 8
  z_max: 32
  x_max: 32
  pixel_per_meter: 1
  num_scale_bins: 33
  num_rotations: 64
  map_encoder:
    embedding_dim: 16
    output_dim: 8
    num_classes:
      areas: 7
      ways: 10
      nodes: 33
    backbone:
      encoder: vgg19
      pretrained: false
      output_scales:
        - 0
      num_downsample: 3
      decoder:
        - 128
        - 64
        - 64
      padding: replicate
    unary_prior: false
  bev_net:
    num_blocks: 4
    latent_dim: 128
    output_dim: 8
    confidence: true
experiment:
  name: OrienterNet_my_multi_city_debug_code_0815_2_monitor_metric
  gpus: 4
  seed: 0
training:
  lr: 0.0001
  lr_scheduler: null
  finetune_from_checkpoint: null
  trainer:
    val_check_interval: 1000
    log_every_n_steps: 100
    # limit_val_batches: 1000
    max_steps: 200000
    devices: ${experiment.gpus}
  checkpointing:
    monitor: "loss/total/val"
    save_top_k: 10
    mode: min

    # filename: '{epoch}-{step}-{loss_SanFrancisco:.2f}'
dataset/UAV/dataset.py
ADDED
@@ -0,0 +1,116 @@
import torch
from torch.utils.data import Dataset
import os
import cv2
# @Time    : 2023-02-13 22:56
# @Author  : Wang Zhen
# @Email   : [email protected]
# @File    : SatelliteTool.py
# @Project : TGRS_seqmatch_2023_1
import numpy as np
import random
from utils.geo import BoundaryBox, Projection
from osm.tiling import TileManager, MapTileManager
from pathlib import Path
from torchvision import transforms
from torch.utils.data import DataLoader


class UavMapPair(Dataset):
    def __init__(
        self,
        root: Path,
        city: str,
        training: bool,
        transform,
    ):
        super().__init__()

        # self.root = root

        # city = 'Manhattan'
        # root = '/root/DATASET/CrossModel/'
        # root = Path(root)
        self.uav_image_path = root / city / 'uav'
        self.map_path = root / city / 'map'
        self.map_vis = root / city / 'map_vis'
        info_path = root / city / 'info.csv'

        self.info = np.loadtxt(str(info_path), dtype=str, delimiter=",", skiprows=1)

        self.transform = transform
        self.training = training

    def random_center_crop(self, image):
        height, width = image.shape[:2]

        # Randomly pick the crop size
        crop_size = random.randint(min(height, width) // 2, min(height, width))

        # Compute the top-left corner of the crop
        start_x = (width - crop_size) // 2
        start_y = (height - crop_size) // 2

        # Perform the crop
        cropped_image = image[start_y:start_y + crop_size, start_x:start_x + crop_size]

        return cropped_image

    def __getitem__(self, index: int):
        id, uav_name, map_name, \
            uav_long, uav_lat, \
            map_long, map_lat, \
            tile_size_meters, pixel_per_meter, \
            u, v, yaw, dis = self.info[index]

        uav_image = cv2.imread(str(self.uav_image_path / uav_name))
        if self.training:
            uav_image = self.random_center_crop(uav_image)
        uav_image = cv2.cvtColor(uav_image, cv2.COLOR_BGR2RGB)
        if self.transform:
            uav_image = self.transform(uav_image)
        map = np.load(str(self.map_path / map_name))

        return {
            'map': torch.from_numpy(np.ascontiguousarray(map)).long(),
            'image': torch.tensor(uav_image),
            'roll_pitch_yaw': torch.tensor((0, 0, float(yaw))).float(),
            'pixels_per_meter': torch.tensor(float(pixel_per_meter)).float(),
            "uv": torch.tensor([float(u), float(v)]).float(),
        }

    def __len__(self):
        return len(self.info)


if __name__ == '__main__':
    root = Path('/root/DATASET/OrienterNet/UavMap/')
    city = 'NewYork'

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize(256),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])

    dataset = UavMapPair(
        root=root,
        city=city,
        training=False,  # required positional argument of __init__
        transform=transform
    )
    datasetloder = DataLoader(dataset, batch_size=3)
    for batch, i in enumerate(datasetloder):
        pass
    # Convert the PyTorch tensor to a PIL image
    # pil_image = Image.fromarray(i['uav_image'][0].permute(1, 2, 0).byte().numpy())

    # Display the image
    # Convert the PyTorch tensor to a NumPy array
    # numpy_array = i['uav_image'][0].numpy()
    #
    # # Display the image
    # plt.imshow(numpy_array.transpose(1, 2, 0))
    # plt.axis('off')
    # plt.show()
    #
    # map_viz, label = Colormap.apply(i['map'][0])
    # map_viz = map_viz * 255
    # map_viz = map_viz.astype(np.uint8)
    # plot_images([map_viz], titles=["OpenStreetMap raster"])
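UavMapPair assumes a per-city directory layout and a 13-column info.csv whose column order is fixed by the unpacking in __getitem__. The sketch below is an illustration, not part of the commit: the header names and the root path are assumptions inferred from the code, and it only runs if the dataset actually exists on disk. It shows a training-mode instantiation and the keys each batch exposes.

# Sketch only (assumes the dataset exists under root; header names are assumptions,
# only the column ORDER is fixed by __getitem__):
#   <root>/<city>/uav/       UAV images referenced by uav_name
#   <root>/<city>/map/       rasterized map tiles (*.npy) referenced by map_name
#   <root>/<city>/info.csv   id,uav_name,map_name,uav_long,uav_lat,map_long,map_lat,
#                            tile_size_meters,pixel_per_meter,u,v,yaw,dis
from pathlib import Path
from torch.utils.data import DataLoader
from torchvision import transforms
from dataset.UAV.dataset import UavMapPair

tfs = transforms.Compose([transforms.ToTensor(), transforms.Resize(256)])
train_set = UavMapPair(root=Path('/root/DATASET/OrienterNet/UavMap/'),
                       city='NewYork',
                       training=True,   # enables the random center crop
                       transform=tfs)
loader = DataLoader(train_set, batch_size=4, shuffle=True)
batch = next(iter(loader))
print(sorted(batch.keys()))  # ['image', 'map', 'pixels_per_meter', 'roll_pitch_yaw', 'uv']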
dataset/__init__.py
ADDED
@@ -0,0 +1,4 @@
# from .UAV.dataset import UavMapPair
from .dataset import UavMapDatasetModule

# modules = {"UAV": UavMapPair}
dataset/dataset.py
ADDED
@@ -0,0 +1,109 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, List
# from logger import logger
import numpy as np
# import torch
# import torch.utils.data as torchdata
# import torchvision.transforms as tvf
from omegaconf import DictConfig, OmegaConf
import pytorch_lightning as pl
from dataset.UAV.dataset import UavMapPair
# from torch.utils.data import Dataset, DataLoader
# from torchvision import transforms
from torch.utils.data import Dataset, ConcatDataset
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as tvf


# Custom data module class, inheriting from pl.LightningDataModule
class UavMapDatasetModule(pl.LightningDataModule):

    def __init__(self, cfg: Dict[str, Any]):
        super().__init__()

        # default_cfg = OmegaConf.create(self.default_cfg)
        # OmegaConf.set_struct(default_cfg, True)  # cannot add new keys
        # self.cfg = OmegaConf.merge(default_cfg, cfg)
        self.cfg = cfg
        # self.transform = tvf.Compose([
        #     tvf.ToTensor(),
        #     tvf.Resize(self.cfg.image_size),
        #     tvf.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        # ])

        tfs = []
        tfs.append(tvf.ToTensor())
        tfs.append(tvf.Resize(self.cfg.image_size))
        self.val_tfs = tvf.Compose(tfs)

        # transforms.Resize(self.cfg.image_size),
        if cfg.augmentation.image.apply:
            args = OmegaConf.masked_copy(
                cfg.augmentation.image, ["brightness", "contrast", "saturation", "hue"]
            )
            tfs.append(tvf.ColorJitter(**args))
        self.train_tfs = tvf.Compose(tfs)

        # self.train_tfs = self.transform
        # self.val_tfs = self.transform
        self.init()

    def init(self):
        self.train_dataset = ConcatDataset([
            UavMapPair(root=Path(self.cfg.root), city=city, training=False, transform=self.train_tfs)
            for city in self.cfg.train_citys
        ])

        self.val_dataset = ConcatDataset([
            UavMapPair(root=Path(self.cfg.root), city=city, training=False, transform=self.val_tfs)
            for city in self.cfg.val_citys
        ])
        self.test_dataset = ConcatDataset([
            UavMapPair(root=Path(self.cfg.root), city=city, training=False, transform=self.val_tfs)
            for city in self.cfg.test_citys
        ])

        # self.val_datasets = {
        #     city: UavMapPair(root=Path(self.cfg.root), city=city, transform=self.val_tfs)
        #     for city in self.cfg.val_citys
        # }
        # logger.info("train data len:{},val data len:{}".format(len(self.train_dataset), len(self.val_dataset)))
        # # Define the split ratio
        # train_ratio = 0.8  # training set ratio
        # # Compute the number of samples in each split
        # train_size = int(len(self.dataset) * train_ratio)
        # val_size = len(self.dataset) - train_size
        # self.train_dataset, self.val_dataset = random_split(self.dataset, [train_size, val_size])

    def train_dataloader(self):
        train_loader = DataLoader(self.train_dataset,
                                  batch_size=self.cfg.train.batch_size,
                                  num_workers=self.cfg.train.num_workers,
                                  shuffle=True, pin_memory=True)
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(self.val_dataset,
                                batch_size=self.cfg.val.batch_size,
                                num_workers=self.cfg.val.num_workers,
                                shuffle=True, pin_memory=True)
        #
        # my_dict = {k: v for k, v in self.val_datasets}
        # val_loaders = {city: DataLoader(dataset,
        #                                 batch_size=self.cfg.val.batch_size,
        #                                 num_workers=self.cfg.val.num_workers,
        #                                 shuffle=False, pin_memory=True) for city, dataset in self.val_datasets.items()}
        return val_loader

    def test_dataloader(self):
        val_loader = DataLoader(self.test_dataset,
                                batch_size=self.cfg.val.batch_size,
                                num_workers=self.cfg.val.num_workers,
                                shuffle=True, pin_memory=True)
        #
        # my_dict = {k: v for k, v in self.val_datasets}
        # val_loaders = {city: DataLoader(dataset,
        #                                 batch_size=self.cfg.val.batch_size,
        #                                 num_workers=self.cfg.val.num_workers,
        #                                 shuffle=False, pin_memory=True) for city, dataset in self.val_datasets.items()}
        return val_loader
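A minimal sketch (not part of the diff) of how this data module could be driven by a config equivalent to the YAML files in conf/; the root path and city list below are placeholders, not values from the repository.

    from omegaconf import OmegaConf
    from dataset.dataset import UavMapDatasetModule

    cfg = OmegaConf.create({
        "root": "/path/to/UAV/",            # placeholder dataset root
        "image_size": 256,
        "train_citys": ["Paris"],
        "val_citys": ["SanFrancisco"],
        "test_citys": ["SanFrancisco"],
        "augmentation": {"image": {"apply": False}},
        "train": {"batch_size": 2, "num_workers": 0},
        "val": {"batch_size": 2, "num_workers": 0},
    })
    datamodule = UavMapDatasetModule(cfg)   # builds train/val/test ConcatDatasets
    loader = datamodule.train_dataloader()  # regular PyTorch DataLoader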
dataset/image.py
ADDED
@@ -0,0 +1,140 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

from typing import Callable, Optional, Union, Sequence

import numpy as np
import torch
import torchvision.transforms.functional as tvf
import collections
from scipy.spatial.transform import Rotation

from utils.geometry import from_homogeneous, to_homogeneous
from utils.wrappers import Camera


def rectify_image(
    image: torch.Tensor,
    cam: Camera,
    roll: float,
    pitch: Optional[float] = None,
    valid: Optional[torch.Tensor] = None,
):
    *_, h, w = image.shape
    grid = torch.meshgrid(
        [torch.arange(w, device=image.device), torch.arange(h, device=image.device)],
        indexing="xy",
    )
    grid = torch.stack(grid, -1).to(image.dtype)

    if pitch is not None:
        args = ("ZX", (roll, pitch))
    else:
        args = ("Z", roll)
    R = Rotation.from_euler(*args, degrees=True).as_matrix()
    R = torch.from_numpy(R).to(image)

    grid_rect = to_homogeneous(cam.normalize(grid)) @ R.T
    grid_rect = cam.denormalize(from_homogeneous(grid_rect))
    grid_norm = (grid_rect + 0.5) / grid.new_tensor([w, h]) * 2 - 1
    rectified = torch.nn.functional.grid_sample(
        image[None],
        grid_norm[None],
        align_corners=False,
        mode="bilinear",
    ).squeeze(0)
    if valid is None:
        valid = torch.all((grid_norm >= -1) & (grid_norm <= 1), -1)
    else:
        valid = (
            torch.nn.functional.grid_sample(
                valid[None, None].float(),
                grid_norm[None],
                align_corners=False,
                mode="nearest",
            )[0, 0]
            > 0
        )
    return rectified, valid


def resize_image(
    image: torch.Tensor,
    size: Union[int, Sequence, np.ndarray],
    fn: Optional[Callable] = None,
    camera: Optional[Camera] = None,
    valid: np.ndarray = None,
):
    """Resize an image to a fixed size, or according to max or min edge."""
    *_, h, w = image.shape
    if fn is not None:
        assert isinstance(size, int)
        scale = size / fn(h, w)
        h_new, w_new = int(round(h * scale)), int(round(w * scale))
        scale = (scale, scale)
    else:
        if isinstance(size, (collections.abc.Sequence, np.ndarray)):
            w_new, h_new = size
        elif isinstance(size, int):
            w_new = h_new = size
        else:
            raise ValueError(f"Incorrect new size: {size}")
        scale = (w_new / w, h_new / h)
    if (w, h) != (w_new, h_new):
        mode = tvf.InterpolationMode.BILINEAR
        image = tvf.resize(image, (h_new, w_new), interpolation=mode, antialias=True)
        image.clip_(0, 1)
        if camera is not None:
            camera = camera.scale(scale)
        if valid is not None:
            valid = tvf.resize(
                valid.unsqueeze(0),
                (h_new, w_new),
                interpolation=tvf.InterpolationMode.NEAREST,
            ).squeeze(0)
    ret = [image, scale]
    if camera is not None:
        ret.append(camera)
    if valid is not None:
        ret.append(valid)
    return ret


def pad_image(
    image: torch.Tensor,
    size: Union[int, Sequence, np.ndarray],
    camera: Optional[Camera] = None,
    valid: torch.Tensor = None,
    crop_and_center: bool = False,
):
    if isinstance(size, int):
        w_new = h_new = size
    elif isinstance(size, (collections.abc.Sequence, np.ndarray)):
        w_new, h_new = size
    else:
        raise ValueError(f"Incorrect new size: {size}")
    *c, h, w = image.shape
    if crop_and_center:
        diff = np.array([w - w_new, h - h_new])
        left, top = left_top = np.round(diff / 2).astype(int)
        right, bottom = diff - left_top
    else:
        assert h <= h_new
        assert w <= w_new
        top = bottom = left = right = 0
    slice_out = np.s_[..., : min(h, h_new), : min(w, w_new)]
    slice_in = np.s_[
        ..., max(top, 0) : h - max(bottom, 0), max(left, 0) : w - max(right, 0)
    ]
    if (w, h) == (w_new, h_new):
        out = image
    else:
        out = torch.zeros((*c, h_new, w_new), dtype=image.dtype)
        out[slice_out] = image[slice_in]
    if camera is not None:
        camera = camera.crop((max(left, 0), max(top, 0)), (w_new, h_new))
    out_valid = torch.zeros((h_new, w_new), dtype=torch.bool)
    out_valid[slice_out] = True if valid is None else valid[slice_in]
    if camera is not None:
        return out, out_valid, camera
    else:
        return out, out_valid
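Hedged usage sketch for the two helpers above; the project-specific Camera handling is skipped and only the tensor paths are exercised.

    import torch
    from dataset.image import resize_image, pad_image

    img = torch.rand(3, 480, 640)                    # C, H, W in [0, 1]
    resized, scale = resize_image(img, 256, fn=max)  # longest edge resized to 256 px
    padded, valid = pad_image(resized, 256)          # zero-pad to 256 x 256 with a validity mask
    print(resized.shape, scale, padded.shape, valid.shape)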
dataset/torch.py
ADDED
@@ -0,0 +1,111 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import collections
import os

import torch
from torch.utils.data import get_worker_info
from torch.utils.data._utils.collate import (
    default_collate_err_msg_format,
    np_str_obj_array_pattern,
)
from lightning_fabric.utilities.seed import pl_worker_init_function
from lightning_utilities.core.apply_func import apply_to_collection
from lightning_fabric.utilities.apply_func import move_data_to_device


def collate(batch):
    """Difference with PyTorch default_collate: it can stack other tensor-like objects.
    Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
    https://github.com/cvg/pixloc
    Released under the Apache License 2.0
    """
    if not isinstance(batch, list):  # no batching
        return batch
    elem = batch[0]
    elem_type = type(elem)
    if isinstance(elem, torch.Tensor):
        out = None
        if torch.utils.data.get_worker_info() is not None:
            # If we're in a background process, concatenate directly into a
            # shared memory tensor to avoid an extra copy
            numel = sum(x.numel() for x in batch)
            storage = elem.storage()._new_shared(numel, device=elem.device)
            out = elem.new(storage).resize_(len(batch), *list(elem.size()))
        return torch.stack(batch, 0, out=out)
    elif (
        elem_type.__module__ == "numpy"
        and elem_type.__name__ != "str_"
        and elem_type.__name__ != "string_"
    ):
        if elem_type.__name__ == "ndarray" or elem_type.__name__ == "memmap":
            # array of string classes and object
            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
                raise TypeError(default_collate_err_msg_format.format(elem.dtype))

            return collate([torch.as_tensor(b) for b in batch])
        elif elem.shape == ():  # scalars
            return torch.as_tensor(batch)
    elif isinstance(elem, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(elem, int):
        return torch.tensor(batch)
    elif isinstance(elem, (str, bytes)):
        return batch
    elif isinstance(elem, collections.abc.Mapping):
        return {key: collate([d[key] for d in batch]) for key in elem}
    elif isinstance(elem, tuple) and hasattr(elem, "_fields"):  # namedtuple
        return elem_type(*(collate(samples) for samples in zip(*batch)))
    elif isinstance(elem, collections.abc.Sequence):
        # check to make sure that the elements in batch have consistent size
        it = iter(batch)
        elem_size = len(next(it))
        if not all(len(elem) == elem_size for elem in it):
            raise RuntimeError("each element in list of batch should be of equal size")
        transposed = zip(*batch)
        return [collate(samples) for samples in transposed]
    else:
        # try to stack anyway in case the object implements stacking.
        try:
            return torch.stack(batch, 0)
        except TypeError as e:
            if "expected Tensor as element" in str(e):
                return batch
            else:
                raise e


def set_num_threads(nt):
    """Force numpy and other libraries to use a limited number of threads."""
    try:
        import mkl
    except ImportError:
        pass
    else:
        mkl.set_num_threads(nt)
    torch.set_num_threads(1)
    os.environ["IPC_ENABLE"] = "1"
    for o in [
        "OPENBLAS_NUM_THREADS",
        "NUMEXPR_NUM_THREADS",
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
    ]:
        os.environ[o] = str(nt)


def worker_init_fn(i):
    info = get_worker_info()
    pl_worker_init_function(info.id)
    num_threads = info.dataset.cfg.get("num_threads")
    if num_threads is not None:
        set_num_threads(num_threads)


def unbatch_to_device(data, device="cpu"):
    data = move_data_to_device(data, device)
    data = apply_to_collection(data, torch.Tensor, lambda x: x.squeeze(0))
    data = apply_to_collection(
        data, list, lambda x: x[0] if len(x) == 1 and isinstance(x[0], str) else x
    )
    return data
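Small sketch of the custom collate above: dictionaries of tensors and scalars are stacked recursively, while strings are returned as plain lists.

    import torch
    from dataset.torch import collate

    batch = [
        {"uv": torch.zeros(2), "yaw": 0.0, "name": "a"},
        {"uv": torch.ones(2), "yaw": 90.0, "name": "b"},
    ]
    out = collate(batch)
    print(out["uv"].shape)  # torch.Size([2, 2])
    print(out["yaw"])       # float64 tensor of shape (2,)
    print(out["name"])      # ['a', 'b']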
evaluation/kitti.py
ADDED
@@ -0,0 +1,89 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import argparse
from pathlib import Path
from typing import Optional, Tuple

from omegaconf import OmegaConf, DictConfig

from .. import logger
from ..data import KittiDataModule
from .run import evaluate


default_cfg_single = OmegaConf.create({})
# For the sequential evaluation, we need to center the map around the GT location,
# since random offsets would accumulate and leave only the GT location with a valid mask.
# This should not have much impact on the results.
default_cfg_sequential = OmegaConf.create(
    {
        "data": {
            "mask_radius": KittiDataModule.default_cfg["max_init_error"],
            "prior_range_rotation": KittiDataModule.default_cfg[
                "max_init_error_rotation"
            ]
            + 1,
            "max_init_error": 0,
            "max_init_error_rotation": 0,
        },
        "chunking": {
            "max_length": 100,  # about 10s?
        },
    }
)


def run(
    split: str,
    experiment: str,
    cfg: Optional[DictConfig] = None,
    sequential: bool = False,
    thresholds: Tuple[int] = (1, 3, 5),
    **kwargs,
):
    cfg = cfg or {}
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    default = default_cfg_sequential if sequential else default_cfg_single
    cfg = OmegaConf.merge(default, cfg)
    dataset = KittiDataModule(cfg.get("data", {}))

    metrics = evaluate(
        experiment,
        cfg,
        dataset,
        split=split,
        sequential=sequential,
        viz_kwargs=dict(show_dir_error=True, show_masked_prob=False),
        **kwargs,
    )

    keys = ["directional_error", "yaw_max_error"]
    if sequential:
        keys += ["directional_seq_error", "yaw_seq_error"]
    for k in keys:
        rec = metrics[k].recall(thresholds).double().numpy().round(2).tolist()
        logger.info("Recall %s: %s at %s m/°", k, rec, thresholds)
    return metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--experiment", type=str, required=True)
    parser.add_argument(
        "--split", type=str, default="test", choices=["test", "val", "train"]
    )
    parser.add_argument("--sequential", action="store_true")
    parser.add_argument("--output_dir", type=Path)
    parser.add_argument("--num", type=int)
    parser.add_argument("dotlist", nargs="*")
    args = parser.parse_args()
    cfg = OmegaConf.from_cli(args.dotlist)
    run(
        args.split,
        args.experiment,
        cfg,
        args.sequential,
        output_dir=args.output_dir,
        num=args.num,
    )
evaluation/mapillary.py
ADDED
File without changes
evaluation/run.py
ADDED
@@ -0,0 +1,252 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import functools
from itertools import islice
from typing import Callable, Dict, Optional, Tuple
from pathlib import Path

import numpy as np
import torch
from omegaconf import DictConfig, OmegaConf
from torchmetrics import MetricCollection
from pytorch_lightning import seed_everything
from tqdm import tqdm

from logger import logger, EXPERIMENTS_PATH
from dataset.torch import collate, unbatch_to_device
from models.voting import argmax_xyr, fuse_gps
from models.metrics import AngleError, LateralLongitudinalError, Location2DError
from models.sequential import GPSAligner, RigidAligner
from module import GenericModule
from utils.io import download_file, DATA_URL
from evaluation.viz import plot_example_single, plot_example_sequential
from evaluation.utils import write_dump


pretrained_models = dict(
    OrienterNet_MGL=("orienternet_mgl.ckpt", dict(num_rotations=256)),
)


def resolve_checkpoint_path(experiment_or_path: str) -> Path:
    path = Path(experiment_or_path)
    if not path.exists():
        # provided name of experiment
        path = Path(EXPERIMENTS_PATH, *experiment_or_path.split("/"))
        if not path.exists():
            if experiment_or_path in set(p for p, _ in pretrained_models.values()):
                download_file(f"{DATA_URL}/{experiment_or_path}", path)
            else:
                raise FileNotFoundError(path)
    if path.is_file():
        return path
    # provided only the experiment name
    maybe_path = path / "last-step-v1.ckpt"
    if not maybe_path.exists():
        maybe_path = path / "last.ckpt"
    if not maybe_path.exists():
        raise FileNotFoundError(f"Could not find any checkpoint in {path}.")
    return maybe_path


@torch.no_grad()
def evaluate_single_image(
    dataloader: torch.utils.data.DataLoader,
    model: GenericModule,
    num: Optional[int] = None,
    callback: Optional[Callable] = None,
    progress: bool = True,
    mask_index: Optional[Tuple[int]] = None,
    has_gps: bool = False,
):
    ppm = model.model.conf.pixel_per_meter
    metrics = MetricCollection(model.model.metrics())
    metrics["directional_error"] = LateralLongitudinalError(ppm)
    if has_gps:
        metrics["xy_gps_error"] = Location2DError("uv_gps", ppm)
        metrics["xy_fused_error"] = Location2DError("uv_fused", ppm)
        metrics["yaw_fused_error"] = AngleError("yaw_fused")
    metrics = metrics.to(model.device)

    for i, batch_ in enumerate(
        islice(tqdm(dataloader, total=num, disable=not progress), num)
    ):
        batch = model.transfer_batch_to_device(batch_, model.device, i)
        # Ablation: mask semantic classes
        if mask_index is not None:
            mask = batch["map"][0, mask_index[0]] == (mask_index[1] + 1)
            batch["map"][0, mask_index[0]][mask] = 0
        pred = model(batch)

        if has_gps:
            (uv_gps,) = pred["uv_gps"] = batch["uv_gps"]
            pred["log_probs_fused"] = fuse_gps(
                pred["log_probs"], uv_gps, ppm, sigma=batch["accuracy_gps"]
            )
            uvt_fused = argmax_xyr(pred["log_probs_fused"])
            pred["uv_fused"] = uvt_fused[..., :2]
            pred["yaw_fused"] = uvt_fused[..., -1]
            del uv_gps, uvt_fused

        results = metrics(pred, batch)
        if callback is not None:
            callback(
                i, model, unbatch_to_device(pred), unbatch_to_device(batch_), results
            )
        del batch_, batch, pred, results

    return metrics.cpu()


@torch.no_grad()
def evaluate_sequential(
    dataset: torch.utils.data.Dataset,
    chunk2idx: Dict,
    model: GenericModule,
    num: Optional[int] = None,
    shuffle: bool = False,
    callback: Optional[Callable] = None,
    progress: bool = True,
    num_rotations: int = 512,
    mask_index: Optional[Tuple[int]] = None,
    has_gps: bool = True,
):
    chunk_keys = list(chunk2idx)
    if shuffle:
        chunk_keys = [chunk_keys[i] for i in torch.randperm(len(chunk_keys))]
    if num is not None:
        chunk_keys = chunk_keys[:num]
    lengths = [len(chunk2idx[k]) for k in chunk_keys]
    logger.info(
        "Min/max/med lengths: %d/%d/%d, total number of images: %d",
        min(lengths),
        np.median(lengths),
        max(lengths),
        sum(lengths),
    )
    viz = callback is not None

    metrics = MetricCollection(model.model.metrics())
    ppm = model.model.conf.pixel_per_meter
    metrics["directional_error"] = LateralLongitudinalError(ppm)
    metrics["xy_seq_error"] = Location2DError("uv_seq", ppm)
    metrics["yaw_seq_error"] = AngleError("yaw_seq")
    metrics["directional_seq_error"] = LateralLongitudinalError(ppm, key="uv_seq")
    if has_gps:
        metrics["xy_gps_error"] = Location2DError("uv_gps", ppm)
        metrics["xy_gps_seq_error"] = Location2DError("uv_gps_seq", ppm)
        metrics["yaw_gps_seq_error"] = AngleError("yaw_gps_seq")
    metrics = metrics.to(model.device)

    keys_save = ["uvr_max", "uv_max", "yaw_max", "uv_expectation"]
    if has_gps:
        keys_save.append("uv_gps")
    if viz:
        keys_save.append("log_probs")

    for chunk_index, key in enumerate(tqdm(chunk_keys, disable=not progress)):
        indices = chunk2idx[key]
        aligner = RigidAligner(track_priors=viz, num_rotations=num_rotations)
        if has_gps:
            aligner_gps = GPSAligner(track_priors=viz, num_rotations=num_rotations)
        batches = []
        preds = []
        for i in indices:
            data = dataset[i]
            data = model.transfer_batch_to_device(data, model.device, 0)
            pred = model(collate([data]))

            canvas = data["canvas"]
            data["xy_geo"] = xy = canvas.to_xy(data["uv"].double())
            data["yaw"] = yaw = data["roll_pitch_yaw"][-1].double()
            aligner.update(pred["log_probs"][0], canvas, xy, yaw)

            if has_gps:
                (uv_gps) = pred["uv_gps"] = data["uv_gps"][None]
                xy_gps = canvas.to_xy(uv_gps.double())
                aligner_gps.update(xy_gps, data["accuracy_gps"], canvas, xy, yaw)

            if not viz:
                data.pop("image")
                data.pop("map")
            batches.append(data)
            preds.append({k: pred[k][0] for k in keys_save})
            del pred

        xy_gt = torch.stack([b["xy_geo"] for b in batches])
        yaw_gt = torch.stack([b["yaw"] for b in batches])
        aligner.compute()
        xy_seq, yaw_seq = aligner.transform(xy_gt, yaw_gt)
        if has_gps:
            aligner_gps.compute()
            xy_gps_seq, yaw_gps_seq = aligner_gps.transform(xy_gt, yaw_gt)
        results = []
        for i in range(len(indices)):
            preds[i]["uv_seq"] = batches[i]["canvas"].to_uv(xy_seq[i]).float()
            preds[i]["yaw_seq"] = yaw_seq[i].float()
            if has_gps:
                preds[i]["uv_gps_seq"] = (
                    batches[i]["canvas"].to_uv(xy_gps_seq[i]).float()
                )
                preds[i]["yaw_gps_seq"] = yaw_gps_seq[i].float()
            results.append(metrics(preds[i], batches[i]))
        if viz:
            callback(chunk_index, model, batches, preds, results, aligner)
        del aligner, preds, batches, results
    return metrics.cpu()


def evaluate(
    experiment: str,
    cfg: DictConfig,
    dataset,
    split: str,
    sequential: bool = False,
    output_dir: Optional[Path] = None,
    callback: Optional[Callable] = None,
    num_workers: int = 1,
    viz_kwargs=None,
    **kwargs,
):
    if experiment in pretrained_models:
        experiment, cfg_override = pretrained_models[experiment]
        cfg = OmegaConf.merge(OmegaConf.create(dict(model=cfg_override)), cfg)

    logger.info("Evaluating model %s with config %s", experiment, cfg)
    checkpoint_path = resolve_checkpoint_path(experiment)
    model = GenericModule.load_from_checkpoint(
        checkpoint_path, cfg=cfg, find_best=not experiment.endswith(".ckpt")
    )
    model = model.eval()
    if torch.cuda.is_available():
        model = model.cuda()

    dataset.prepare_data()
    dataset.setup()

    if output_dir is not None:
        output_dir.mkdir(exist_ok=True, parents=True)
        if callback is None:
            if sequential:
                callback = plot_example_sequential
            else:
                callback = plot_example_single
            callback = functools.partial(
                callback, out_dir=output_dir, **(viz_kwargs or {})
            )
    kwargs = {**kwargs, "callback": callback}

    seed_everything(dataset.cfg.seed)
    if sequential:
        dset, chunk2idx = dataset.sequence_dataset(split, **cfg.chunking)
        metrics = evaluate_sequential(dset, chunk2idx, model, **kwargs)
    else:
        loader = dataset.dataloader(split, shuffle=True, num_workers=num_workers)
        metrics = evaluate_single_image(loader, model, **kwargs)

    results = metrics.compute()
    logger.info("All results: %s", results)
    if output_dir is not None:
        write_dump(output_dir, experiment, cfg, results, metrics)
        logger.info("Outputs have been written to %s.", output_dir)
    return metrics
evaluation/utils.py
ADDED
@@ -0,0 +1,40 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import numpy as np
from omegaconf import OmegaConf

from utils.io import write_json


def compute_recall(errors):
    num_elements = len(errors)
    sort_idx = np.argsort(errors)
    errors = np.array(errors.copy())[sort_idx]
    recall = (np.arange(num_elements) + 1) / num_elements
    recall = np.r_[0, recall]
    errors = np.r_[0, errors]
    return errors, recall


def compute_auc(errors, recall, thresholds):
    aucs = []
    for t in thresholds:
        last_index = np.searchsorted(errors, t, side="right")
        r = np.r_[recall[:last_index], recall[last_index - 1]]
        e = np.r_[errors[:last_index], t]
        auc = np.trapz(r, x=e) / t
        aucs.append(auc * 100)
    return aucs


def write_dump(output_dir, experiment, cfg, results, metrics):
    dump = {
        "experiment": experiment,
        "cfg": OmegaConf.to_container(cfg),
        "results": results,
        "errors": {},
    }
    for k, m in metrics.items():
        if hasattr(m, "get_errors"):
            dump["errors"][k] = m.get_errors().numpy()
    write_json(output_dir / "log.json", dump)
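Worked example for the recall/AUC helpers above (errors assumed to be in meters): two samples with 1 m and 3 m error give a recall curve anchored at zero, and the AUC at a 2 m threshold integrates the recall up to that threshold.

    from evaluation.utils import compute_recall, compute_auc

    errors, recall = compute_recall([1.0, 3.0])
    # errors -> [0., 1., 3.], recall -> [0., 0.5, 1.]
    print(compute_auc(errors, recall, thresholds=[2]))  # [37.5]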
evaluation/viz.py
ADDED
@@ -0,0 +1,178 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import numpy as np
import torch
import matplotlib.pyplot as plt

from utils.io import write_torch_image
from utils.viz_2d import plot_images, features_to_RGB, save_plot
from utils.viz_localization import (
    likelihood_overlay,
    plot_pose,
    plot_dense_rotations,
    add_circle_inset,
)
from osm.viz import Colormap, plot_nodes


def plot_example_single(
    idx,
    model,
    pred,
    data,
    results,
    plot_bev=True,
    out_dir=None,
    fig_for_paper=False,
    show_gps=False,
    show_fused=False,
    show_dir_error=False,
    show_masked_prob=False,
):
    scene, name, rasters, uv_gt = (data[k] for k in ("scene", "name", "map", "uv"))
    uv_gps = data.get("uv_gps")
    yaw_gt = data["roll_pitch_yaw"][-1].numpy()
    image = data["image"].permute(1, 2, 0)
    if "valid" in data:
        image = image.masked_fill(~data["valid"].unsqueeze(-1), 0.3)

    lp_uvt = lp_uv = pred["log_probs"]
    if show_fused and "log_probs_fused" in pred:
        lp_uvt = lp_uv = pred["log_probs_fused"]
    elif not show_masked_prob and "scores_unmasked" in pred:
        lp_uvt = lp_uv = pred["scores_unmasked"]
    has_rotation = lp_uvt.ndim == 3
    if has_rotation:
        lp_uv = lp_uvt.max(-1).values
    if lp_uv.min() > -np.inf:
        lp_uv = lp_uv.clip(min=np.percentile(lp_uv, 1))
    prob = lp_uv.exp()
    uv_p, yaw_p = pred["uv_max"], pred.get("yaw_max")
    if show_fused and "uv_fused" in pred:
        uv_p, yaw_p = pred["uv_fused"], pred.get("yaw_fused")
    feats_map = pred["map"]["map_features"][0]
    (feats_map_rgb,) = features_to_RGB(feats_map.numpy())

    text1 = rf'$\Delta xy$: {results["xy_max_error"]:.1f}m'
    if has_rotation:
        text1 += rf', $\Delta\theta$: {results["yaw_max_error"]:.1f}°'
    if show_fused and "xy_fused_error" in results:
        text1 += rf', $\Delta xy_{{fused}}$: {results["xy_fused_error"]:.1f}m'
        text1 += rf', $\Delta\theta_{{fused}}$: {results["yaw_fused_error"]:.1f}°'
    if show_dir_error and "directional_error" in results:
        err_lat, err_lon = results["directional_error"]
        text1 += rf", $\Delta$lateral/longitundinal={err_lat:.1f}m/{err_lon:.1f}m"
    if "xy_gps_error" in results:
        text1 += rf', $\Delta xy_{{GPS}}$: {results["xy_gps_error"]:.1f}m'

    map_viz = Colormap.apply(rasters)
    overlay = likelihood_overlay(prob.numpy(), map_viz.mean(-1, keepdims=True))
    plot_images(
        [image, map_viz, overlay, feats_map_rgb],
        titles=[text1, "map", "likelihood", "neural map"],
        dpi=75,
        cmaps="jet",
    )
    fig = plt.gcf()
    axes = fig.axes
    axes[1].images[0].set_interpolation("none")
    axes[2].images[0].set_interpolation("none")
    Colormap.add_colorbar()
    plot_nodes(1, rasters[2])

    if show_gps and uv_gps is not None:
        plot_pose([1], uv_gps, c="blue")
    plot_pose([1], uv_gt, yaw_gt, c="red")
    plot_pose([1], uv_p, yaw_p, c="k")
    plot_dense_rotations(2, lp_uvt.exp())
    inset_center = pred["uv_max"] if results["xy_max_error"] < 5 else uv_gt
    axins = add_circle_inset(axes[2], inset_center)
    axins.scatter(*uv_gt, lw=1, c="red", ec="k", s=50, zorder=15)
    axes[0].text(
        0.003,
        0.003,
        f"{scene}/{name}",
        transform=axes[0].transAxes,
        fontsize=3,
        va="bottom",
        ha="left",
        color="w",
    )
    plt.show()
    if out_dir is not None:
        name_ = name.replace("/", "_")
        p = str(out_dir / f"{scene}_{name_}_{{}}.pdf")
        save_plot(p.format("pred"))
        plt.close()

        if fig_for_paper:
            # !cp ../datasets/MGL/{scene}/images/{name}.jpg {out_dir}/{scene}_{name}.jpg
            plot_images([map_viz])
            plt.gca().images[0].set_interpolation("none")
            plot_nodes(0, rasters[2])
            plot_pose([0], uv_gt, yaw_gt, c="red")
            plot_pose([0], pred["uv_max"], pred["yaw_max"], c="k")
            save_plot(p.format("map"))
            plt.close()
            plot_images([lp_uv], cmaps="jet")
            plot_dense_rotations(0, lp_uvt.exp())
            save_plot(p.format("loglikelihood"), dpi=100)
            plt.close()
            plot_images([overlay])
            plt.gca().images[0].set_interpolation("none")
            axins = add_circle_inset(plt.gca(), inset_center)
            axins.scatter(*uv_gt, lw=1, c="red", ec="k", s=50)
            save_plot(p.format("likelihood"))
            plt.close()
            write_torch_image(
                p.format("neuralmap").replace("pdf", "jpg"), feats_map_rgb
            )
            write_torch_image(p.format("image").replace("pdf", "jpg"), image.numpy())

    if not plot_bev:
        return

    feats_q = pred["features_bev"]
    mask_bev = pred["valid_bev"]
    prior = None
    if "log_prior" in pred["map"]:
        prior = pred["map"]["log_prior"][0].sigmoid()
    if "bev" in pred and "confidence" in pred["bev"]:
        conf_q = pred["bev"]["confidence"]
    else:
        conf_q = torch.norm(feats_q, dim=0)
    conf_q = conf_q.masked_fill(~mask_bev, np.nan)
    (feats_q_rgb,) = features_to_RGB(feats_q.numpy(), masks=[mask_bev.numpy()])
    # feats_map_rgb, feats_q_rgb, = features_to_RGB(
    #     feats_map.numpy(), feats_q.numpy(), masks=[None, mask_bev])
    norm_map = torch.norm(feats_map, dim=0)

    plot_images(
        [conf_q, feats_q_rgb, norm_map] + ([] if prior is None else [prior]),
        titles=["BEV confidence", "BEV features", "map norm"]
        + ([] if prior is None else ["map prior"]),
        dpi=50,
        cmaps="jet",
    )
    plt.show()

    if out_dir is not None:
        save_plot(p.format("bev"))
        plt.close()


def plot_example_sequential(
    idx,
    model,
    pred,
    data,
    results,
    plot_bev=True,
    out_dir=None,
    fig_for_paper=False,
    show_gps=False,
    show_fused=False,
    show_dir_error=False,
    show_masked_prob=False,
):
    return
feature_extractor_models/__init__.py
ADDED
@@ -0,0 +1,82 @@
from . import encoders
from . import decoders


from .decoders.unet import Unet
from .decoders.unetplusplus import UnetPlusPlus
from .decoders.manet import MAnet
from .decoders.linknet import Linknet
from .decoders.fpn import FPN
from .decoders.lightfpn import LightFPN
from .decoders.pspnet import PSPNet
from .decoders.deeplabv3 import DeepLabV3, DeepLabV3Plus
from .decoders.pan import PAN
from .base.hub_mixin import from_pretrained

from .__version__ import __version__

# some private imports for create_model function
from typing import Optional as _Optional
import torch as _torch


def create_model(
    arch: str,
    encoder_name: str = "resnet34",
    encoder_weights: _Optional[str] = "imagenet",
    in_channels: int = 3,
    classes: int = 1,
    **kwargs,
) -> _torch.nn.Module:
    """Models entrypoint, allows to create any model architecture just with
    parameters, without using its class
    """

    archs = [
        Unet,
        UnetPlusPlus,
        MAnet,
        Linknet,
        FPN,
        LightFPN,
        PSPNet,
        DeepLabV3,
        DeepLabV3Plus,
        PAN,
    ]
    archs_dict = {a.__name__.lower(): a for a in archs}
    try:
        model_class = archs_dict[arch.lower()]
    except KeyError:
        raise KeyError(
            "Wrong architecture type `{}`. Available options are: {}".format(
                arch, list(archs_dict.keys())
            )
        )
    return model_class(
        encoder_name=encoder_name,
        encoder_weights=encoder_weights,
        in_channels=in_channels,
        classes=classes,
        **kwargs,
    )


__all__ = [
    "encoders",
    "decoders",
    "Unet",
    "UnetPlusPlus",
    "MAnet",
    "Linknet",
    "FPN",
    "LightFPN",
    "PSPNet",
    "DeepLabV3",
    "DeepLabV3Plus",
    "PAN",
    "from_pretrained",
    "create_model",
    "__version__",
]
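Usage sketch for the factory above; which encoder names are actually available depends on the vendored encoders package, so "resnet34" is an assumption here.

    import feature_extractor_models as smp

    model = smp.create_model(
        arch="fpn",              # case-insensitive lookup in archs_dict
        encoder_name="resnet34",
        encoder_weights=None,    # skip downloading pretrained weights
        in_channels=3,
        classes=8,
    )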
feature_extractor_models/__version__.py
ADDED
@@ -0,0 +1,3 @@
VERSION = (0, 3, "4dev0")

__version__ = ".".join(map(str, VERSION))
feature_extractor_models/base/__init__.py
ADDED
@@ -0,0 +1,13 @@
from .model import SegmentationModel

from .modules import Conv2dReLU, Attention

from .heads import SegmentationHead, ClassificationHead

__all__ = [
    "SegmentationModel",
    "Conv2dReLU",
    "Attention",
    "SegmentationHead",
    "ClassificationHead",
]
feature_extractor_models/base/heads.py
ADDED
@@ -0,0 +1,34 @@
import torch.nn as nn
from .modules import Activation


class SegmentationHead(nn.Sequential):
    def __init__(
        self, in_channels, out_channels, kernel_size=3, activation=None, upsampling=1
    ):
        conv2d = nn.Conv2d(
            in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2
        )
        upsampling = (
            nn.UpsamplingBilinear2d(scale_factor=upsampling)
            if upsampling > 1
            else nn.Identity()
        )
        activation = Activation(activation)
        super().__init__(conv2d, upsampling, activation)


class ClassificationHead(nn.Sequential):
    def __init__(
        self, in_channels, classes, pooling="avg", dropout=0.2, activation=None
    ):
        if pooling not in ("max", "avg"):
            raise ValueError(
                "Pooling should be one of ('max', 'avg'), got {}.".format(pooling)
            )
        pool = nn.AdaptiveAvgPool2d(1) if pooling == "avg" else nn.AdaptiveMaxPool2d(1)
        flatten = nn.Flatten()
        dropout = nn.Dropout(p=dropout, inplace=True) if dropout else nn.Identity()
        linear = nn.Linear(in_channels, classes, bias=True)
        activation = Activation(activation)
        super().__init__(pool, flatten, dropout, linear, activation)
feature_extractor_models/base/hub_mixin.py
ADDED
@@ -0,0 +1,154 @@
import json
from pathlib import Path
from typing import Optional, Union
from functools import wraps
from huggingface_hub import (
    PyTorchModelHubMixin,
    ModelCard,
    ModelCardData,
    hf_hub_download,
)


MODEL_CARD = """
---
{{ card_data }}
---
# {{ model_name }} Model Card

Table of Contents:
- [Load trained model](#load-trained-model)
- [Model init parameters](#model-init-parameters)
- [Model metrics](#model-metrics)
- [Dataset](#dataset)

## Load trained model
```python
import feature_extractor_models as smp

model = smp.{{ model_name }}.from_pretrained("{{ save_directory | default("<save-directory-or-repo>", true)}}")
```

## Model init parameters
```python
model_init_params = {{ model_parameters }}
```

## Model metrics
{{ metrics | default("[More Information Needed]", true) }}

## Dataset
Dataset name: {{ dataset | default("[More Information Needed]", true) }}

## More Information
- Library: {{ repo_url | default("[More Information Needed]", true) }}
- Docs: {{ docs_url | default("[More Information Needed]", true) }}

This model has been pushed to the Hub using the [PytorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin)
"""


def _format_parameters(parameters: dict):
    params = {k: v for k, v in parameters.items() if not k.startswith("_")}
    params = [
        f'"{k}": {v}' if not isinstance(v, str) else f'"{k}": "{v}"'
        for k, v in params.items()
    ]
    params = ",\n".join([f" {param}" for param in params])
    params = "{\n" + f"{params}" + "\n}"
    return params


class SMPHubMixin(PyTorchModelHubMixin):
    def generate_model_card(self, *args, **kwargs) -> ModelCard:
        model_parameters_json = _format_parameters(self._hub_mixin_config)
        directory = self._save_directory if hasattr(self, "_save_directory") else None
        repo_id = self._repo_id if hasattr(self, "_repo_id") else None
        repo_or_directory = repo_id if repo_id is not None else directory

        metrics = self._metrics if hasattr(self, "_metrics") else None
        dataset = self._dataset if hasattr(self, "_dataset") else None

        if metrics is not None:
            metrics = json.dumps(metrics, indent=4)
            metrics = f"```json\n{metrics}\n```"

        model_card_data = ModelCardData(
            languages=["python"],
            library_name="segmentation-models-pytorch",
            license="mit",
            tags=["semantic-segmentation", "pytorch", "segmentation-models-pytorch"],
            pipeline_tag="image-segmentation",
        )
        model_card = ModelCard.from_template(
            card_data=model_card_data,
            template_str=MODEL_CARD,
            repo_url="https://github.com/qubvel/segmentation_models.pytorch",
            docs_url="https://smp.readthedocs.io/en/latest/",
            model_parameters=model_parameters_json,
            save_directory=repo_or_directory,
            model_name=self.__class__.__name__,
            metrics=metrics,
            dataset=dataset,
        )
        return model_card

    def _set_attrs_from_kwargs(self, attrs, kwargs):
        for attr in attrs:
            if attr in kwargs:
                setattr(self, f"_{attr}", kwargs.pop(attr))

    def _del_attrs(self, attrs):
        for attr in attrs:
            if hasattr(self, f"_{attr}"):
                delattr(self, f"_{attr}")

    @wraps(PyTorchModelHubMixin.save_pretrained)
    def save_pretrained(
        self, save_directory: Union[str, Path], *args, **kwargs
    ) -> Optional[str]:
        # set additional attributes to be used in generate_model_card
        self._save_directory = save_directory
        self._set_attrs_from_kwargs(["metrics", "dataset"], kwargs)

        # set additional attribute to be used in from_pretrained
        self._hub_mixin_config["_model_class"] = self.__class__.__name__

        try:
            # call the original save_pretrained
            result = super().save_pretrained(save_directory, *args, **kwargs)
        finally:
            # delete the additional attributes
            self._del_attrs(["save_directory", "metrics", "dataset"])
            self._hub_mixin_config.pop("_model_class")

        return result

    @wraps(PyTorchModelHubMixin.push_to_hub)
    def push_to_hub(self, repo_id: str, *args, **kwargs):
        self._repo_id = repo_id
        self._set_attrs_from_kwargs(["metrics", "dataset"], kwargs)
        result = super().push_to_hub(repo_id, *args, **kwargs)
        self._del_attrs(["repo_id", "metrics", "dataset"])
        return result

    @property
    def config(self):
        return self._hub_mixin_config


@wraps(PyTorchModelHubMixin.from_pretrained)
def from_pretrained(pretrained_model_name_or_path: str, *args, **kwargs):
    config_path = hf_hub_download(
        pretrained_model_name_or_path,
        filename="config.json",
        revision=kwargs.get("revision", None),
    )
    with open(config_path, "r") as f:
        config = json.load(f)
    model_class_name = config.pop("_model_class")

    import feature_extractor_models as smp

    model_class = getattr(smp, model_class_name)
    return model_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
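Hedged sketch of the save/load round-trip enabled by this mixin; the checkpoint directory, dataset name, and metrics below are placeholders. Note that the module-level from_pretrained wrapper above resolves config.json through hf_hub_download, so it expects a Hub repo id, while the per-class from_pretrained also accepts a local directory.

    import feature_extractor_models as smp

    model = smp.FPN(encoder_name="resnet34", encoder_weights=None, classes=8)
    model.save_pretrained("./checkpoints/fpn-demo",       # also writes the generated model card
                          dataset="UavMap", metrics={"mIoU": 0.0})
    reloaded = smp.FPN.from_pretrained("./checkpoints/fpn-demo")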
feature_extractor_models/base/initialization.py
ADDED
@@ -0,0 +1,26 @@
import torch.nn as nn


def initialize_decoder(module):
    for m in module.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_uniform_(m.weight, mode="fan_in", nonlinearity="relu")
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

        elif isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)


def initialize_head(module):
    for m in module.modules():
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
feature_extractor_models/base/model.py
ADDED
@@ -0,0 +1,71 @@
import torch

from . import initialization as init
from .hub_mixin import SMPHubMixin
import torch.nn as nn


class SegmentationModel(torch.nn.Module, SMPHubMixin):
    def initialize(self):

        # self.out = nn.Sequential(
        #     nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
        #     nn.BatchNorm2d(8),
        #     nn.ReLU(inplace=True),
        # )
        init.initialize_decoder(self.decoder)
        init.initialize_head(self.segmentation_head)
        if self.classification_head is not None:
            init.initialize_head(self.classification_head)

    def check_input_shape(self, x):
        h, w = x.shape[-2:]
        output_stride = self.encoder.output_stride
        if h % output_stride != 0 or w % output_stride != 0:
            new_h = (
                (h // output_stride + 1) * output_stride
                if h % output_stride != 0
                else h
            )
            new_w = (
                (w // output_stride + 1) * output_stride
                if w % output_stride != 0
                else w
            )
            raise RuntimeError(
                f"Wrong input shape height={h}, width={w}. Expected image height and width "
                f"divisible by {output_stride}. Consider padding your images to shape ({new_h}, {new_w})."
            )

    def forward(self, x):
        """Sequentially pass `x` through the model's encoder, decoder and heads."""

        self.check_input_shape(x)

        features = self.encoder(x)
        decoder_output = self.decoder(*features)

        decoder_output = self.segmentation_head(decoder_output)
        #
        # if self.classification_head is not None:
        #     labels = self.classification_head(features[-1])
        #     return masks, labels

        return decoder_output

    @torch.no_grad()
    def predict(self, x):
        """Inference method. Switch model to `eval` mode, call `.forward(x)` with `torch.no_grad()`

        Args:
            x: 4D torch tensor with shape (batch_size, channels, height, width)

        Return:
            prediction: 4D torch tensor with shape (batch_size, classes, height, width)

        """
        if self.training:
            self.eval()

        x = self.forward(x)

        return x
feature_extractor_models/base/modules.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
|
4 |
+
try:
|
5 |
+
    from inplace_abn import InPlaceABN
except ImportError:
    InPlaceABN = None


class Conv2dReLU(nn.Sequential):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        padding=0,
        stride=1,
        use_batchnorm=True,
    ):
        if use_batchnorm == "inplace" and InPlaceABN is None:
            raise RuntimeError(
                "In order to use `use_batchnorm='inplace'` inplace_abn package must be installed. "
                + "To install see: https://github.com/mapillary/inplace_abn"
            )

        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            bias=not (use_batchnorm),
        )
        relu = nn.ReLU(inplace=True)

        if use_batchnorm == "inplace":
            bn = InPlaceABN(out_channels, activation="leaky_relu", activation_param=0.0)
            relu = nn.Identity()

        elif use_batchnorm and use_batchnorm != "inplace":
            bn = nn.BatchNorm2d(out_channels)

        else:
            bn = nn.Identity()

        super(Conv2dReLU, self).__init__(conv, bn, relu)


class SCSEModule(nn.Module):
    def __init__(self, in_channels, reduction=16):
        super().__init__()
        self.cSE = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, in_channels // reduction, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels // reduction, in_channels, 1),
            nn.Sigmoid(),
        )
        self.sSE = nn.Sequential(nn.Conv2d(in_channels, 1, 1), nn.Sigmoid())

    def forward(self, x):
        return x * self.cSE(x) + x * self.sSE(x)


class ArgMax(nn.Module):
    def __init__(self, dim=None):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        return torch.argmax(x, dim=self.dim)


class Clamp(nn.Module):
    def __init__(self, min=0, max=1):
        super().__init__()
        self.min, self.max = min, max

    def forward(self, x):
        return torch.clamp(x, self.min, self.max)


class Activation(nn.Module):
    def __init__(self, name, **params):
        super().__init__()

        if name is None or name == "identity":
            self.activation = nn.Identity(**params)
        elif name == "sigmoid":
            self.activation = nn.Sigmoid()
        elif name == "relu":
            self.activation = nn.ReLU(inplace=True)
        elif name == "softmax2d":
            self.activation = nn.Softmax(dim=1, **params)
        elif name == "softmax":
            self.activation = nn.Softmax(**params)
        elif name == "logsoftmax":
            self.activation = nn.LogSoftmax(**params)
        elif name == "tanh":
            self.activation = nn.Tanh()
        elif name == "argmax":
            self.activation = ArgMax(**params)
        elif name == "argmax2d":
            self.activation = ArgMax(dim=1, **params)
        elif name == "clamp":
            self.activation = Clamp(**params)
        elif callable(name):
            self.activation = name(**params)
        else:
            raise ValueError(
                f"Activation should be callable/sigmoid/softmax/logsoftmax/tanh/"
                f"argmax/argmax2d/clamp/None; got {name}"
            )

    def forward(self, x):
        return self.activation(x)


class Attention(nn.Module):
    def __init__(self, name, **params):
        super().__init__()

        if name is None:
            self.attention = nn.Identity(**params)
        elif name == "scse":
            self.attention = SCSEModule(**params)
        else:
            raise ValueError("Attention {} is not implemented".format(name))

    def forward(self, x):
        return self.attention(x)
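For orientation, a minimal usage sketch of the building blocks above (Conv2dReLU, SCSEModule, Activation). The import path feature_extractor_models.base.modules follows the file layout in this commit; the tensor shapes are illustrative only and assume the module works as written above.

import torch
from feature_extractor_models.base.modules import Conv2dReLU, SCSEModule, Activation

x = torch.randn(2, 3, 64, 64)                        # dummy NCHW batch
block = Conv2dReLU(3, 16, kernel_size=3, padding=1)  # conv -> BN -> ReLU
attn = SCSEModule(16)                                # channel + spatial gating
act = Activation("softmax2d")                        # softmax over the channel dim
y = act(attn(block(x)))                              # -> (2, 16, 64, 64)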
feature_extractor_models/decoders/__init__.py
ADDED
File without changes
feature_extractor_models/decoders/deeplabv3/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import DeepLabV3, DeepLabV3Plus

__all__ = ["DeepLabV3", "DeepLabV3Plus"]
feature_extractor_models/decoders/deeplabv3/decoder.py
ADDED
@@ -0,0 +1,220 @@
"""
BSD 3-Clause License

Copyright (c) Soumith Chintala 2016,
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import torch
from torch import nn
from torch.nn import functional as F

__all__ = ["DeepLabV3Decoder"]


class DeepLabV3Decoder(nn.Sequential):
    def __init__(self, in_channels, out_channels=256, atrous_rates=(12, 24, 36)):
        super().__init__(
            ASPP(in_channels, out_channels, atrous_rates),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )
        self.out_channels = out_channels

    def forward(self, *features):
        return super().forward(features[-1])


class DeepLabV3PlusDecoder(nn.Module):
    def __init__(
        self,
        encoder_channels,
        out_channels=256,
        atrous_rates=(12, 24, 36),
        output_stride=16,
    ):
        super().__init__()
        if output_stride not in {8, 16}:
            raise ValueError(
                "Output stride should be 8 or 16, got {}.".format(output_stride)
            )

        self.out_channels = out_channels
        self.output_stride = output_stride

        self.aspp = nn.Sequential(
            ASPP(encoder_channels[-1], out_channels, atrous_rates, separable=True),
            SeparableConv2d(
                out_channels, out_channels, kernel_size=3, padding=1, bias=False
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

        scale_factor = 2 if output_stride == 8 else 4
        self.up = nn.UpsamplingBilinear2d(scale_factor=scale_factor)

        highres_in_channels = encoder_channels[-4]
        highres_out_channels = 48  # proposed by authors of paper
        self.block1 = nn.Sequential(
            nn.Conv2d(
                highres_in_channels, highres_out_channels, kernel_size=1, bias=False
            ),
            nn.BatchNorm2d(highres_out_channels),
            nn.ReLU(),
        )
        self.block2 = nn.Sequential(
            SeparableConv2d(
                highres_out_channels + out_channels,
                out_channels,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

    def forward(self, *features):
        aspp_features = self.aspp(features[-1])
        aspp_features = self.up(aspp_features)
        high_res_features = self.block1(features[-4])
        concat_features = torch.cat([aspp_features, high_res_features], dim=1)
        fused_features = self.block2(concat_features)
        return fused_features


class ASPPConv(nn.Sequential):
    def __init__(self, in_channels, out_channels, dilation):
        super().__init__(
            nn.Conv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )


class ASPPSeparableConv(nn.Sequential):
    def __init__(self, in_channels, out_channels, dilation):
        super().__init__(
            SeparableConv2d(
                in_channels,
                out_channels,
                kernel_size=3,
                padding=dilation,
                dilation=dilation,
                bias=False,
            ),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )


class ASPPPooling(nn.Sequential):
    def __init__(self, in_channels, out_channels):
        super().__init__(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        size = x.shape[-2:]
        for mod in self:
            x = mod(x)
        return F.interpolate(x, size=size, mode="bilinear", align_corners=False)


class ASPP(nn.Module):
    def __init__(self, in_channels, out_channels, atrous_rates, separable=False):
        super(ASPP, self).__init__()
        modules = []
        modules.append(
            nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
            )
        )

        rate1, rate2, rate3 = tuple(atrous_rates)
        ASPPConvModule = ASPPConv if not separable else ASPPSeparableConv

        modules.append(ASPPConvModule(in_channels, out_channels, rate1))
        modules.append(ASPPConvModule(in_channels, out_channels, rate2))
        modules.append(ASPPConvModule(in_channels, out_channels, rate3))
        modules.append(ASPPPooling(in_channels, out_channels))

        self.convs = nn.ModuleList(modules)

        self.project = nn.Sequential(
            nn.Conv2d(5 * out_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

    def forward(self, x):
        res = []
        for conv in self.convs:
            res.append(conv(x))
        res = torch.cat(res, dim=1)
        return self.project(res)


class SeparableConv2d(nn.Sequential):
    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
    ):
        dephtwise_conv = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=False,
        )
        pointwise_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)
        super().__init__(dephtwise_conv, pointwise_conv)
feature_extractor_models/decoders/deeplabv3/model.py
ADDED
@@ -0,0 +1,178 @@
from typing import Optional

from feature_extractor_models.base import (
    SegmentationModel,
    SegmentationHead,
    ClassificationHead,
)
from feature_extractor_models.encoders import get_encoder
from .decoder import DeepLabV3Decoder, DeepLabV3PlusDecoder


class DeepLabV3(SegmentationModel):
    """DeepLabV3_ implementation from "Rethinking Atrous Convolution for Semantic Image Segmentation"

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        decoder_channels: A number of convolution filters in ASPP module. Default is 256
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        upsampling: Final upsampling factor. Default is 8 to preserve input-output spatial shape identity
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)
    Returns:
        ``torch.nn.Module``: **DeepLabV3**

    .. _DeeplabV3:
        https://arxiv.org/abs/1706.05587

    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_channels: int = 256,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 8,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
            output_stride=8,
        )

        self.decoder = DeepLabV3Decoder(
            in_channels=self.encoder.out_channels[-1], out_channels=decoder_channels
        )

        self.segmentation_head = SegmentationHead(
            in_channels=self.decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None


class DeepLabV3Plus(SegmentationModel):
    """DeepLabV3+ implementation from "Encoder-Decoder with Atrous Separable
    Convolution for Semantic Image Segmentation"

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        encoder_output_stride: Downsampling factor for last encoder features (see original paper for explanation)
        decoder_atrous_rates: Dilation rates for ASPP module (should be a tuple of 3 integer values)
        decoder_channels: A number of convolution filters in ASPP module. Default is 256
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        upsampling: Final upsampling factor. Default is 4 to preserve input-output spatial shape identity
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)
    Returns:
        ``torch.nn.Module``: **DeepLabV3Plus**

    Reference:
        https://arxiv.org/abs/1802.02611v3

    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        encoder_output_stride: int = 16,
        decoder_channels: int = 256,
        decoder_atrous_rates: tuple = (12, 24, 36),
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 4,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        if encoder_output_stride not in [8, 16]:
            raise ValueError(
                "Encoder output stride should be 8 or 16, got {}".format(
                    encoder_output_stride
                )
            )

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
            output_stride=encoder_output_stride,
        )

        self.decoder = DeepLabV3PlusDecoder(
            encoder_channels=self.encoder.out_channels,
            out_channels=decoder_channels,
            atrous_rates=decoder_atrous_rates,
            output_stride=encoder_output_stride,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=self.decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None
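A minimal construction sketch for the models above. It assumes the encoder registry bundled in this commit resolves "resnet34" and that SegmentationModel.forward follows the usual encoder-decoder-head contract (neither is shown in this file); encoder_weights=None avoids any download, and the output resolution matches the input because the decoder stride and the final upsampling factor cancel out.

import torch
from feature_extractor_models.decoders.deeplabv3 import DeepLabV3Plus

model = DeepLabV3Plus(encoder_name="resnet34", encoder_weights=None, classes=2)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 256, 256))  # expected: (1, 2, 256, 256)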
feature_extractor_models/decoders/fpn/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import FPN

__all__ = ["FPN"]
feature_extractor_models/decoders/fpn/decoder.py
ADDED
@@ -0,0 +1,133 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class Conv3x3GNReLU(nn.Module):
    def __init__(self, in_channels, out_channels, upsample=False):
        super().__init__()
        self.upsample = upsample
        self.block = nn.Sequential(
            nn.Conv2d(
                in_channels, out_channels, (3, 3), stride=1, padding=1, bias=False
            ),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x = self.block(x)
        if self.upsample:
            x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        return x


class FPNBlock(nn.Module):
    def __init__(self, pyramid_channels, skip_channels):
        super().__init__()
        self.skip_conv = nn.Conv2d(skip_channels, pyramid_channels, kernel_size=1)

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        skip = self.skip_conv(skip)
        x = x + skip
        return x


class SegmentationBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_upsamples=0):
        super().__init__()

        blocks = [Conv3x3GNReLU(in_channels, out_channels, upsample=bool(n_upsamples))]

        if n_upsamples > 1:
            for _ in range(1, n_upsamples):
                blocks.append(Conv3x3GNReLU(out_channels, out_channels, upsample=True))

        self.block = nn.Sequential(*blocks)

    def forward(self, x):
        return self.block(x)


class MergeBlock(nn.Module):
    def __init__(self, policy):
        super().__init__()
        if policy not in ["add", "cat"]:
            raise ValueError(
                "`merge_policy` must be one of: ['add', 'cat'], got {}".format(policy)
            )
        self.policy = policy

    def forward(self, x):
        if self.policy == "add":
            return sum(x)
        elif self.policy == "cat":
            return torch.cat(x, dim=1)
        else:
            raise ValueError(
                "`merge_policy` must be one of: ['add', 'cat'], got {}".format(
                    self.policy
                )
            )


class FPNDecoder(nn.Module):
    def __init__(
        self,
        encoder_channels,
        encoder_depth=5,
        pyramid_channels=256,
        segmentation_channels=128,
        dropout=0.2,
        merge_policy="add",
    ):
        super().__init__()

        self.out_channels = (
            segmentation_channels
            if merge_policy == "add"
            else segmentation_channels * 4
        )
        if encoder_depth < 3:
            raise ValueError(
                "Encoder depth for FPN decoder cannot be less than 3, got {}.".format(
                    encoder_depth
                )
            )

        encoder_channels = encoder_channels[::-1]
        encoder_channels = encoder_channels[: encoder_depth + 1]

        self.p5 = nn.Conv2d(encoder_channels[0], pyramid_channels, kernel_size=1)
        self.p4 = FPNBlock(pyramid_channels, encoder_channels[1])
        self.p3 = FPNBlock(pyramid_channels, encoder_channels[2])
        self.p2 = FPNBlock(pyramid_channels, encoder_channels[3])

        self.seg_blocks = nn.ModuleList(
            [
                SegmentationBlock(
                    pyramid_channels, segmentation_channels, n_upsamples=n_upsamples
                )
                for n_upsamples in [3, 2, 1, 0]
            ]
        )

        self.merge = MergeBlock(merge_policy)
        self.dropout = nn.Dropout2d(p=dropout, inplace=True)

    def forward(self, *features):
        c2, c3, c4, c5 = features[-4:]

        p5 = self.p5(c5)
        p4 = self.p4(p5, c4)
        p3 = self.p3(p4, c3)
        p2 = self.p2(p3, c2)

        feature_pyramid = [
            seg_block(p) for seg_block, p in zip(self.seg_blocks, [p5, p4, p3, p2])
        ]
        x = self.merge(feature_pyramid)
        x = self.dropout(x)

        return x
feature_extractor_models/decoders/fpn/model.py
ADDED
@@ -0,0 +1,107 @@
from typing import Optional

from feature_extractor_models.base import (
    SegmentationModel,
    SegmentationHead,
    ClassificationHead,
)
from feature_extractor_models.encoders import get_encoder
from .decoder import FPNDecoder


class FPN(SegmentationModel):
    """FPN_ is a fully convolution neural network for image semantic segmentation.

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        decoder_pyramid_channels: A number of convolution filters in Feature Pyramid of FPN_
        decoder_segmentation_channels: A number of convolution filters in segmentation blocks of FPN_
        decoder_merge_policy: Determines how to merge pyramid features inside FPN. Available options are **add**
            and **cat**
        decoder_dropout: Spatial dropout rate in range (0, 1) for feature pyramid in FPN_
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        upsampling: Final upsampling factor. Default is 4 to preserve input-output spatial shape identity
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)

    Returns:
        ``torch.nn.Module``: **FPN**

    .. _FPN:
        http://presentations.cocodataset.org/COCO17-Stuff-FAIR.pdf

    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_pyramid_channels: int = 256,
        decoder_segmentation_channels: int = 128,
        decoder_merge_policy: str = "add",
        decoder_dropout: float = 0.2,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 4,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        # validate input params
        if encoder_name.startswith("mit_b") and encoder_depth != 5:
            raise ValueError(
                "Encoder {} support only encoder_depth=5".format(encoder_name)
            )

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
        )

        self.decoder = FPNDecoder(
            encoder_channels=self.encoder.out_channels,
            encoder_depth=encoder_depth,
            pyramid_channels=decoder_pyramid_channels,
            segmentation_channels=decoder_segmentation_channels,
            dropout=decoder_dropout,
            merge_policy=decoder_merge_policy,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=self.decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "fpn-{}".format(encoder_name)
        self.initialize()
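A short usage sketch for the FPN wrapper above, under the same assumptions as the DeepLabV3+ example (encoder registry resolves "resnet34", standard SegmentationModel forward). It also illustrates the decoder_merge_policy switch described in the docstring: "cat" concatenates the four pyramid levels, so the decoder feeds 4 * decoder_segmentation_channels feature maps into the 1x1 segmentation head instead of decoder_segmentation_channels.

import torch
from feature_extractor_models.decoders.fpn import FPN

model = FPN(
    encoder_name="resnet34",
    encoder_weights=None,
    decoder_merge_policy="cat",  # "add" (default) sums the pyramid levels instead
    classes=1,
)
out = model(torch.randn(1, 3, 256, 256))  # expected: (1, 1, 256, 256)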
feature_extractor_models/decoders/lightfpn/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import LightFPN

__all__ = ["LightFPN"]
feature_extractor_models/decoders/lightfpn/decoder.py
ADDED
@@ -0,0 +1,144 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class Conv3x3GNReLU(nn.Module):
    def __init__(self, in_channels, out_channels, upsample=False):
        super().__init__()
        self.upsample = upsample
        self.block = nn.Sequential(
            nn.Conv2d(
                in_channels, out_channels, (3, 3), stride=1, padding=1, bias=False
            ),
            nn.GroupNorm(32, out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        x = self.block(x)
        if self.upsample:
            x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
        return x


class DepthwiseSeparableConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super().__init__()
        self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size, stride, padding, groups=in_channels)
        self.pointwise = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        x = self.depthwise(x)
        x = self.pointwise(x)
        return x


class LightFPNBlock(nn.Module):
    def __init__(self, pyramid_channels, skip_channels):
        super().__init__()
        self.skip_conv = DepthwiseSeparableConv2d(skip_channels, pyramid_channels, kernel_size=1)

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        skip = self.skip_conv(skip)
        x = x + skip
        return x


class SegmentationBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_upsamples=0):
        super().__init__()

        blocks = [Conv3x3GNReLU(in_channels, out_channels, upsample=bool(n_upsamples))]

        if n_upsamples > 1:
            for _ in range(1, n_upsamples):
                blocks.append(Conv3x3GNReLU(out_channels, out_channels, upsample=True))

        self.block = nn.Sequential(*blocks)

    def forward(self, x):
        return self.block(x)


class MergeBlock(nn.Module):
    def __init__(self, policy):
        super().__init__()
        if policy not in ["add", "cat"]:
            raise ValueError(
                "`merge_policy` must be one of: ['add', 'cat'], got {}".format(policy)
            )
        self.policy = policy

    def forward(self, x):
        if self.policy == "add":
            return sum(x)
        elif self.policy == "cat":
            return torch.cat(x, dim=1)
        else:
            raise ValueError(
                "`merge_policy` must be one of: ['add', 'cat'], got {}".format(
                    self.policy
                )
            )


class FPNDecoder(nn.Module):
    def __init__(
        self,
        encoder_channels,
        encoder_depth=5,
        pyramid_channels=256,
        segmentation_channels=128,
        dropout=0.2,
        merge_policy="add",
    ):
        super().__init__()

        self.out_channels = (
            segmentation_channels
            if merge_policy == "add"
            else segmentation_channels * 4
        )
        if encoder_depth < 3:
            raise ValueError(
                "Encoder depth for FPN decoder cannot be less than 3, got {}.".format(
                    encoder_depth
                )
            )

        encoder_channels = encoder_channels[::-1]
        encoder_channels = encoder_channels[: encoder_depth + 1]

        self.p5 = nn.Conv2d(encoder_channels[0], pyramid_channels, kernel_size=1)
        self.p4 = LightFPNBlock(pyramid_channels, encoder_channels[1])
        self.p3 = LightFPNBlock(pyramid_channels, encoder_channels[2])
        self.p2 = LightFPNBlock(pyramid_channels, encoder_channels[3])

        self.seg_blocks = nn.ModuleList(
            [
                SegmentationBlock(
                    pyramid_channels, segmentation_channels, n_upsamples=n_upsamples
                )
                for n_upsamples in [3, 2, 1, 0]
            ]
        )

        self.merge = MergeBlock(merge_policy)
        self.dropout = nn.Dropout2d(p=dropout, inplace=True)

    def forward(self, *features):
        c2, c3, c4, c5 = features[-4:]

        p5 = self.p5(c5)
        p4 = self.p4(p5, c4)
        p3 = self.p3(p4, c3)
        p2 = self.p2(p3, c2)

        feature_pyramid = [
            seg_block(p) for seg_block, p in zip(self.seg_blocks, [p5, p4, p3, p2])
        ]
        x = self.merge(feature_pyramid)
        x = self.dropout(x)

        return x
feature_extractor_models/decoders/lightfpn/model.py
ADDED
@@ -0,0 +1,107 @@
from typing import Optional

from feature_extractor_models.base import (
    SegmentationModel,
    SegmentationHead,
    ClassificationHead,
)
from feature_extractor_models.encoders import get_encoder
from .decoder import FPNDecoder


class LightFPN(SegmentationModel):
    """FPN_ is a fully convolution neural network for image semantic segmentation.

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        decoder_pyramid_channels: A number of convolution filters in Feature Pyramid of FPN_
        decoder_segmentation_channels: A number of convolution filters in segmentation blocks of FPN_
        decoder_merge_policy: Determines how to merge pyramid features inside FPN. Available options are **add**
            and **cat**
        decoder_dropout: Spatial dropout rate in range (0, 1) for feature pyramid in FPN_
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        upsampling: Final upsampling factor. Default is 4 to preserve input-output spatial shape identity
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)

    Returns:
        ``torch.nn.Module``: **FPN**

    .. _FPN:
        http://presentations.cocodataset.org/COCO17-Stuff-FAIR.pdf

    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_pyramid_channels: int = 256,
        decoder_segmentation_channels: int = 128,
        decoder_merge_policy: str = "add",
        decoder_dropout: float = 0.2,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[str] = None,
        upsampling: int = 4,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        # validate input params
        if encoder_name.startswith("mit_b") and encoder_depth != 5:
            raise ValueError(
                "Encoder {} support only encoder_depth=5".format(encoder_name)
            )

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
        )

        self.decoder = FPNDecoder(
            encoder_channels=self.encoder.out_channels,
            encoder_depth=encoder_depth,
            pyramid_channels=decoder_pyramid_channels,
            segmentation_channels=decoder_segmentation_channels,
            dropout=decoder_dropout,
            merge_policy=decoder_merge_policy,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=self.decoder.out_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=1,
            upsampling=upsampling,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "fpn-{}".format(encoder_name)
        self.initialize()
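As the decoder above shows, LightFPN differs from FPN only in the lateral connections: LightFPNBlock projects each encoder skip with the DepthwiseSeparableConv2d defined in its decoder file instead of a plain Conv2d. A construction sketch under the same assumptions as the FPN example:

import torch
from feature_extractor_models.decoders.lightfpn import LightFPN

model = LightFPN(encoder_name="resnet34", encoder_weights=None, classes=1)
out = model(torch.randn(1, 3, 256, 256))  # same input/output contract as FPN above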
feature_extractor_models/decoders/linknet/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import Linknet

__all__ = ["Linknet"]
feature_extractor_models/decoders/linknet/decoder.py
ADDED
@@ -0,0 +1,82 @@
import torch.nn as nn

from feature_extractor_models.base import modules


class TransposeX2(nn.Sequential):
    def __init__(self, in_channels, out_channels, use_batchnorm=True):
        super().__init__()
        layers = [
            nn.ConvTranspose2d(
                in_channels, out_channels, kernel_size=4, stride=2, padding=1
            ),
            nn.ReLU(inplace=True),
        ]

        if use_batchnorm:
            layers.insert(1, nn.BatchNorm2d(out_channels))

        super().__init__(*layers)


class DecoderBlock(nn.Module):
    def __init__(self, in_channels, out_channels, use_batchnorm=True):
        super().__init__()

        self.block = nn.Sequential(
            modules.Conv2dReLU(
                in_channels,
                in_channels // 4,
                kernel_size=1,
                use_batchnorm=use_batchnorm,
            ),
            TransposeX2(
                in_channels // 4, in_channels // 4, use_batchnorm=use_batchnorm
            ),
            modules.Conv2dReLU(
                in_channels // 4,
                out_channels,
                kernel_size=1,
                use_batchnorm=use_batchnorm,
            ),
        )

    def forward(self, x, skip=None):
        x = self.block(x)
        if skip is not None:
            x = x + skip
        return x


class LinknetDecoder(nn.Module):
    def __init__(
        self, encoder_channels, prefinal_channels=32, n_blocks=5, use_batchnorm=True
    ):
        super().__init__()

        # remove first skip
        encoder_channels = encoder_channels[1:]
        # reverse channels to start from head of encoder
        encoder_channels = encoder_channels[::-1]

        channels = list(encoder_channels) + [prefinal_channels]

        self.blocks = nn.ModuleList(
            [
                DecoderBlock(channels[i], channels[i + 1], use_batchnorm=use_batchnorm)
                for i in range(n_blocks)
            ]
        )

    def forward(self, *features):
        features = features[1:]  # remove first skip
        features = features[::-1]  # reverse channels to start from head of encoder

        x = features[0]
        skips = features[1:]

        for i, decoder_block in enumerate(self.blocks):
            skip = skips[i] if i < len(skips) else None
            x = decoder_block(x, skip)

        return x
feature_extractor_models/decoders/linknet/model.py
ADDED
@@ -0,0 +1,98 @@
from typing import Optional, Union

from feature_extractor_models.base import (
    SegmentationHead,
    SegmentationModel,
    ClassificationHead,
)
from feature_extractor_models.encoders import get_encoder
from .decoder import LinknetDecoder


class Linknet(SegmentationModel):
    """Linknet_ is a fully convolution neural network for image semantic segmentation. Consist of *encoder*
    and *decoder* parts connected with *skip connections*. Encoder extract features of different spatial
    resolution (skip connections) which are used by decoder to define accurate segmentation mask. Use *sum*
    for fusing decoder blocks with skip connections.

    Note:
        This implementation by default has 4 skip connections (original - 3).

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        decoder_use_batchnorm: If **True**, BatchNorm2d layer between Conv2D and Activation layers
            is used. If **"inplace"** InplaceABN will be used, allows to decrease memory consumption.
            Available options are **True, False, "inplace"**
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)

    Returns:
        ``torch.nn.Module``: **Linknet**

    .. _Linknet:
        https://arxiv.org/abs/1707.03718
    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_use_batchnorm: bool = True,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[Union[str, callable]] = None,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        if encoder_name.startswith("mit_b"):
            raise ValueError(
                "Encoder `{}` is not supported for Linknet".format(encoder_name)
            )

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
        )

        self.decoder = LinknetDecoder(
            encoder_channels=self.encoder.out_channels,
            n_blocks=encoder_depth,
            prefinal_channels=32,
            use_batchnorm=decoder_use_batchnorm,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=32, out_channels=classes, activation=activation, kernel_size=1
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "link-{}".format(encoder_name)
        self.initialize()
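A minimal Linknet sketch under the same assumptions as the earlier examples. As the decoder above shows, skips are fused by addition (x = x + skip) rather than concatenation, which keeps the decoder narrow; the 224-pixel input is only an illustrative size divisible by the encoder stride.

import torch
from feature_extractor_models.decoders.linknet import Linknet

model = Linknet(encoder_name="resnet34", encoder_weights=None, classes=3)
out = model(torch.randn(1, 3, 224, 224))  # expected: (1, 3, 224, 224)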
feature_extractor_models/decoders/manet/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import MAnet

__all__ = ["MAnet"]
feature_extractor_models/decoders/manet/decoder.py
ADDED
@@ -0,0 +1,187 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from feature_extractor_models.base import modules as md


class PAB(nn.Module):
    def __init__(self, in_channels, out_channels, pab_channels=64):
        super(PAB, self).__init__()
        # Series of 1x1 conv to generate attention feature maps
        self.pab_channels = pab_channels
        self.in_channels = in_channels
        self.top_conv = nn.Conv2d(in_channels, pab_channels, kernel_size=1)
        self.center_conv = nn.Conv2d(in_channels, pab_channels, kernel_size=1)
        self.bottom_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)
        self.map_softmax = nn.Softmax(dim=1)
        self.out_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, padding=1)

    def forward(self, x):
        bsize = x.size()[0]
        h = x.size()[2]
        w = x.size()[3]
        x_top = self.top_conv(x)
        x_center = self.center_conv(x)
        x_bottom = self.bottom_conv(x)

        x_top = x_top.flatten(2)
        x_center = x_center.flatten(2).transpose(1, 2)
        x_bottom = x_bottom.flatten(2).transpose(1, 2)

        sp_map = torch.matmul(x_center, x_top)
        sp_map = self.map_softmax(sp_map.view(bsize, -1)).view(bsize, h * w, h * w)
        sp_map = torch.matmul(sp_map, x_bottom)
        sp_map = sp_map.reshape(bsize, self.in_channels, h, w)
        x = x + sp_map
        x = self.out_conv(x)
        return x


class MFAB(nn.Module):
    def __init__(
        self, in_channels, skip_channels, out_channels, use_batchnorm=True, reduction=16
    ):
        # MFAB is just a modified version of SE-blocks, one for skip, one for input
        super(MFAB, self).__init__()
        self.hl_conv = nn.Sequential(
            md.Conv2dReLU(
                in_channels,
                in_channels,
                kernel_size=3,
                padding=1,
                use_batchnorm=use_batchnorm,
            ),
            md.Conv2dReLU(
                in_channels, skip_channels, kernel_size=1, use_batchnorm=use_batchnorm
            ),
        )
        reduced_channels = max(1, skip_channels // reduction)
        self.SE_ll = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(skip_channels, reduced_channels, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduced_channels, skip_channels, 1),
            nn.Sigmoid(),
        )
        self.SE_hl = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(skip_channels, reduced_channels, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(reduced_channels, skip_channels, 1),
            nn.Sigmoid(),
        )
        self.conv1 = md.Conv2dReLU(
            skip_channels
            + skip_channels,  # we transform C-prime form high level to C from skip connection
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )
        self.conv2 = md.Conv2dReLU(
            out_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )

    def forward(self, x, skip=None):
        x = self.hl_conv(x)
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        attention_hl = self.SE_hl(x)
        if skip is not None:
            attention_ll = self.SE_ll(skip)
            attention_hl = attention_hl + attention_ll
            x = x * attention_hl
            x = torch.cat([x, skip], dim=1)
        x = self.conv1(x)
        x = self.conv2(x)
        return x


class DecoderBlock(nn.Module):
    def __init__(self, in_channels, skip_channels, out_channels, use_batchnorm=True):
        super().__init__()
        self.conv1 = md.Conv2dReLU(
            in_channels + skip_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )
        self.conv2 = md.Conv2dReLU(
            out_channels,
            out_channels,
            kernel_size=3,
            padding=1,
            use_batchnorm=use_batchnorm,
        )

    def forward(self, x, skip=None):
        x = F.interpolate(x, scale_factor=2, mode="nearest")
        if skip is not None:
            x = torch.cat([x, skip], dim=1)
        x = self.conv1(x)
        x = self.conv2(x)
        return x


class MAnetDecoder(nn.Module):
    def __init__(
        self,
        encoder_channels,
        decoder_channels,
        n_blocks=5,
        reduction=16,
        use_batchnorm=True,
        pab_channels=64,
    ):
        super().__init__()

        if n_blocks != len(decoder_channels):
            raise ValueError(
                "Model depth is {}, but you provide `decoder_channels` for {} blocks.".format(
                    n_blocks, len(decoder_channels)
                )
            )

        # remove first skip with same spatial resolution
        encoder_channels = encoder_channels[1:]

        # reverse channels to start from head of encoder
        encoder_channels = encoder_channels[::-1]

        # computing blocks input and output channels
        head_channels = encoder_channels[0]
        in_channels = [head_channels] + list(decoder_channels[:-1])
        skip_channels = list(encoder_channels[1:]) + [0]
        out_channels = decoder_channels

        self.center = PAB(head_channels, head_channels, pab_channels=pab_channels)

        # combine decoder keyword arguments
        kwargs = dict(use_batchnorm=use_batchnorm)  # no attention type here
        blocks = [
            MFAB(in_ch, skip_ch, out_ch, reduction=reduction, **kwargs)
            if skip_ch > 0
            else DecoderBlock(in_ch, skip_ch, out_ch, **kwargs)
            for in_ch, skip_ch, out_ch in zip(in_channels, skip_channels, out_channels)
        ]
        # for the last we dont have skip connection -> use simple decoder block
        self.blocks = nn.ModuleList(blocks)

    def forward(self, *features):
        features = features[1:]  # remove first skip with same spatial resolution
        features = features[::-1]  # reverse channels to start from head of encoder

        head = features[0]
        skips = features[1:]

        x = self.center(head)
        for i, decoder_block in enumerate(self.blocks):
            skip = skips[i] if i < len(skips) else None
            x = decoder_block(x, skip)

        return x
feature_extractor_models/decoders/manet/model.py
ADDED
@@ -0,0 +1,102 @@
from typing import Optional, Union, List

from feature_extractor_models.encoders import get_encoder
from feature_extractor_models.base import (
    SegmentationModel,
    SegmentationHead,
    ClassificationHead,
)
from .decoder import MAnetDecoder


class MAnet(SegmentationModel):
    """MAnet_ : Multi-scale Attention Net. The MA-Net can capture rich contextual dependencies based on
    the attention mechanism, using two blocks:
        - Position-wise Attention Block (PAB), which captures the spatial dependencies between pixels in a global view
        - Multi-scale Fusion Attention Block (MFAB), which captures the channel dependencies between any feature map by
          multi-scale semantic feature fusion

    Args:
        encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
            to extract features of different spatial resolution
        encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
            two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
            with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
            Default is 5
        encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
            other pretrained weights (see table with available weights for each encoder_name)
        decoder_channels: List of integers which specify **in_channels** parameter for convolutions used in decoder.
            Length of the list should be the same as **encoder_depth**
        decoder_use_batchnorm: If **True**, BatchNorm2d layer between Conv2D and Activation layers
            is used. If **"inplace"** InplaceABN will be used, allows to decrease memory consumption.
            Available options are **True, False, "inplace"**
        decoder_pab_channels: A number of channels for PAB module in decoder.
            Default is 64.
        in_channels: A number of input channels for the model, default is 3 (RGB images)
        classes: A number of classes for output mask (or you can think as a number of channels of output mask)
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**.
            Default is **None**
        aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build
            on top of encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply "sigmoid"/"softmax"
                  (could be **None** to return logits)

    Returns:
        ``torch.nn.Module``: **MAnet**

    .. _MAnet:
        https://ieeexplore.ieee.org/abstract/document/9201310

    """

    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: int = 5,
        encoder_weights: Optional[str] = "imagenet",
        decoder_use_batchnorm: bool = True,
        decoder_channels: List[int] = (256, 128, 64, 32, 16),
        decoder_pab_channels: int = 64,
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[Union[str, callable]] = None,
        aux_params: Optional[dict] = None,
    ):
        super().__init__()

        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
        )

        self.decoder = MAnetDecoder(
            encoder_channels=self.encoder.out_channels,
            decoder_channels=decoder_channels,
            n_blocks=encoder_depth,
            use_batchnorm=decoder_use_batchnorm,
            pab_channels=decoder_pab_channels,
        )

        self.segmentation_head = SegmentationHead(
            in_channels=decoder_channels[-1],
            out_channels=classes,
            activation=activation,
            kernel_size=3,
        )

        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "manet-{}".format(encoder_name)
        self.initialize()
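A minimal MAnet sketch under the same assumptions as above. Note from the decoder code that the PAB center block builds an (H*W) x (H*W) affinity matrix over the bottleneck feature map, so memory grows quadratically with the bottleneck resolution; the small input size here keeps that matrix tiny.

import torch
from feature_extractor_models.decoders.manet import MAnet

model = MAnet(encoder_name="resnet34", encoder_weights=None, classes=2)
out = model(torch.randn(1, 3, 128, 128))  # expected: (1, 2, 128, 128)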
feature_extractor_models/decoders/pan/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .model import PAN

__all__ = ["PAN"]