diff --git a/README.md b/README.md
index 28c904cd34..2443171c86 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,7 @@ Supported methods:
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
+- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
- [x] [PSANet (ECCV'2018)](configs/psanet)
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
- [x] [UPerNet (ECCV'2018)](configs/upernet)
diff --git a/README_zh-CN.md b/README_zh-CN.md
index cc750f613b..ac90eefeef 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -74,6 +74,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O
- [x] [PSPNet (CVPR'2017)](configs/pspnet)
- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
- [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
+- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
- [x] [PSANet (ECCV'2018)](configs/psanet)
- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
- [x] [UPerNet (ECCV'2018)](configs/upernet)
diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py
new file mode 100644
index 0000000000..40698644ba
--- /dev/null
+++ b/configs/_base_/models/bisenetv1_r18-d32.py
@@ -0,0 +1,68 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='BiSeNetV1',
+ in_channels=3,
+ context_channels=(128, 256, 512),
+ spatial_channels=(64, 64, 64, 128),
+ out_indices=(0, 1, 2),
+ out_channels=256,
+ backbone_cfg=dict(
+ type='ResNet',
+ in_channels=3,
+ depth=18,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_cfg=norm_cfg,
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True),
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ init_cfg=None),
+ decode_head=dict(
+ type='FCNHead',
+ in_channels=256,
+ in_index=0,
+ channels=256,
+ num_convs=1,
+ concat_input=False,
+ dropout_ratio=0.1,
+ num_classes=19,
+ norm_cfg=norm_cfg,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=64,
+ num_convs=1,
+ num_classes=19,
+ in_index=1,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ dict(
+ type='FCNHead',
+ in_channels=128,
+ channels=64,
+ num_convs=1,
+ num_classes=19,
+ in_index=2,
+ norm_cfg=norm_cfg,
+ concat_input=False,
+ align_corners=False,
+ loss_decode=dict(
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+ ],
+ # model training and testing settings
+ train_cfg=dict(),
+ test_cfg=dict(mode='whole'))
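For reference, the base config above can be sanity-checked by building it with mmseg's own builder. This is a minimal sketch, not part of the patch: it assumes mmcv and mmsegmentation are installed and that it runs from the repo root, and it swaps `SyncBN` for plain `BN` because `SyncBN` requires a distributed process group.

```python
# Illustrative sketch: instantiate the BiSeNetV1 segmentor from the base
# config. SyncBN is replaced with BN so this runs in a single process.
from mmcv import Config
from mmseg.models import build_segmentor

cfg = Config.fromfile('configs/_base_/models/bisenetv1_r18-d32.py')
bn = dict(type='BN', requires_grad=True)
cfg.model.backbone.norm_cfg = bn
cfg.model.backbone.backbone_cfg.norm_cfg = bn
cfg.model.decode_head.norm_cfg = bn
for aux_head in cfg.model.auxiliary_head:
    aux_head.norm_cfg = bn

model = build_segmentor(cfg.model)  # EncoderDecoder with 1 decode + 2 aux heads
model.init_weights()
```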
diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
new file mode 100644
index 0000000000..344781068a
--- /dev/null
+++ b/configs/bisenetv1/README.md
@@ -0,0 +1,42 @@
+# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation
+
+## Introduction
+
+
+
+[Official Repo](https://github.com/ycszen/TorchSeg/tree/master/model/bisenet)
+
+[Code Snippet](https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266)
+
+
+[BiSeNetV1 (ECCV'2018)](https://arxiv.org/abs/1808.00897)
+
+```latex
+@inproceedings{yu2018bisenet,
+ title={Bisenet: Bilateral segmentation network for real-time semantic segmentation},
+ author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong},
+ booktitle={Proceedings of the European conference on computer vision (ECCV)},
+ pages={325--341},
+ year={2018}
+}
+```
+
+
+
+## Results and models
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------- | -------------- | ---: | ------------- | ------ | -------- |
+| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
+| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
+| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
+| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
+| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
+
+Note:
+
+- `4x8`: 4 GPUs with 8 samples per GPU during training.
+- The default setting is 4 GPUs with 4 samples per GPU during training.
+- `No Pretrain` means the model is trained from scratch, without an ImageNet-pretrained backbone.
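For reference, a checkpoint from the table above can be exercised with mmseg's high-level Python APIs. A minimal sketch, assuming mmsegmentation is installed, the checkpoint has been downloaded locally, and `demo.png` stands in for any test image (pass `device='cpu'` without a GPU):

```python
# Illustrative sketch: run inference with a released BiSeNetV1 checkpoint.
from mmseg.apis import inference_segmentor, init_segmentor

config_file = 'configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py'
checkpoint_file = 'bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth'

model = init_segmentor(config_file, checkpoint_file, device='cuda:0')
result = inference_segmentor(model, 'demo.png')  # 'demo.png' is a placeholder
```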
diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml
new file mode 100644
index 0000000000..6de872b863
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1.yml
@@ -0,0 +1,125 @@
+Collections:
+- Name: bisenetv1
+ Metadata:
+ Training Data:
+ - Cityscapes
+ Paper:
+ URL: https://arxiv.org/abs/1808.00897
+ Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation'
+ README: configs/bisenetv1/README.md
+ Code:
+ URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266
+ Version: v0.18.0
+ Converted From:
+ Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet
+Models:
+- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
+ In Collection: bisenetv1
+ Metadata:
+ backbone: R-18-D32
+ crop size: (1024,1024)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 31.48
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (1024,1024)
+ memory (GB): 5.69
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: Cityscapes
+ Metrics:
+ mIoU: 74.44
+ mIoU(ms+flip): 77.05
+ Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth
+- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+ In Collection: bisenetv1
+ Metadata:
+ backbone: R-18-D32
+ crop size: (1024,1024)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 31.48
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (1024,1024)
+ memory (GB): 5.69
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: Cityscapes
+ Metrics:
+ mIoU: 74.37
+ mIoU(ms+flip): 76.91
+ Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth
+- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
+ In Collection: bisenetv1
+ Metadata:
+ backbone: R-18-D32
+ crop size: (1024,1024)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 31.48
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (1024,1024)
+ memory (GB): 11.17
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: Cityscapes
+ Metrics:
+ mIoU: 75.16
+ mIoU(ms+flip): 77.24
+ Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth
+- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
+ In Collection: bisenetv1
+ Metadata:
+ backbone: R-50-D32
+ crop size: (1024,1024)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 129.7
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (1024,1024)
+ memory (GB): 3.3
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: Cityscapes
+ Metrics:
+ mIoU: 76.92
+ mIoU(ms+flip): 78.87
+ Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth
+- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+ In Collection: bisenetv1
+ Metadata:
+ backbone: R-50-D32
+ crop size: (1024,1024)
+ lr schd: 160000
+ inference time (ms/im):
+ - value: 129.7
+ hardware: V100
+ backend: PyTorch
+ batch size: 1
+ mode: FP32
+ resolution: (1024,1024)
+ memory (GB): 15.39
+ Results:
+ - Task: Semantic Segmentation
+ Dataset: Cityscapes
+ Metrics:
+ mIoU: 77.68
+ mIoU(ms+flip): 79.57
+ Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..f4019e930e
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,11 @@
+_base_ = [
+ '../_base_/models/bisenetv1_r18-d32.py',
+ '../_base_/datasets/cityscapes_1024x1024.py',
+ '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.025)
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+)
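With the default 4 GPUs and `samples_per_gpu=4`, this config trains with an effective batch size of 16, a 0.025 base LR, and a 1000-iteration linear warmup. A merged-config check, as a sketch assuming mmcv is installed and the repo root is the working directory:

```python
from mmcv import Config

cfg = Config.fromfile(
    'configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py')
print(cfg.optimizer.lr)            # 0.025
print(cfg.lr_config.warmup)        # 'linear'
print(cfg.lr_config.warmup_iters)  # 1000
print(cfg.data.samples_per_gpu)    # 4 -> effective batch size 16 on 4 GPUs
```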
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..ef061a16bd
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,16 @@
+_base_ = [
+ '../_base_/models/bisenetv1_r18-d32.py',
+ '../_base_/datasets/cityscapes_1024x1024.py',
+ '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+model = dict(
+ backbone=dict(
+ backbone_cfg=dict(
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.025)
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+)
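Note that the `Pretrained` init_cfg above sits on the inner `backbone_cfg`, not on the BiSeNetV1 wrapper, so only the context-path ResNet starts from ImageNet weights; the spatial path and fusion modules train from scratch. A quick check of the merged config (a sketch, assuming mmcv is installed and run from the repo root):

```python
from mmcv import Config

cfg = Config.fromfile(
    'configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py')
print(cfg.model.backbone.backbone_cfg.init_cfg)
# {'type': 'Pretrained', 'checkpoint': 'open-mmlab://resnet18_v1c'}
print(cfg.model.backbone.init_cfg)  # None: the wrapper itself is not pretrained
```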
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..ea27ef0a11
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
@@ -0,0 +1,5 @@
+_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'
+data = dict(
+ samples_per_gpu=8,
+ workers_per_gpu=8,
+)
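This config only overrides the data loader settings of its 4x4 parent; the model, schedule, and optimizer are inherited unchanged. A merged-config check (sketch, same assumptions as above):

```python
from mmcv import Config

cfg = Config.fromfile(
    'configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py')
print(cfg.data.samples_per_gpu)  # 8, overriding 4 from the parent config
print(cfg.optimizer.lr)          # 0.025, inherited unchanged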
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..193438d364
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,46 @@
+_base_ = [
+ '../_base_/models/bisenetv1_r18-d32.py',
+ '../_base_/datasets/cityscapes_1024x1024.py',
+ '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ type='BiSeNetV1',
+ context_channels=(512, 1024, 2048),
+ spatial_channels=(256, 256, 256, 512),
+ out_channels=1024,
+ backbone_cfg=dict(
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'),
+ type='ResNet',
+ depth=50)),
+ decode_head=dict(
+ type='FCNHead', in_channels=1024, in_index=0, channels=1024),
+ auxiliary_head=[
+ dict(
+ type='FCNHead',
+ in_channels=512,
+ channels=256,
+ num_convs=1,
+ num_classes=19,
+ in_index=1,
+ norm_cfg=norm_cfg,
+ concat_input=False),
+ dict(
+ type='FCNHead',
+ in_channels=512,
+ channels=256,
+ num_convs=1,
+ num_classes=19,
+ in_index=2,
+ norm_cfg=norm_cfg,
+ concat_input=False),
+ ])
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.05)
+data = dict(
+ samples_per_gpu=4,
+ workers_per_gpu=4,
+)
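The R-50 variant widens every module because ResNet-50's last two stages emit 1024 and 2048 channels (versus 256 and 512 for ResNet-18), so `context_channels`, `spatial_channels`, and `out_channels` all scale by 4x. A standalone shape check of the backbone under these widths, as a sketch (the backbone's default `BN` norm lets it run in a single process):

```python
import torch
from mmseg.models.backbones import BiSeNetV1

r50_cfg = dict(
    type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3),
    dilations=(1, 1, 1, 1), strides=(1, 2, 2, 2), style='pytorch',
    contract_dilation=True)
model = BiSeNetV1(
    backbone_cfg=r50_cfg,
    context_channels=(512, 1024, 2048),   # refined width, stage-3 width, stage-4 width
    spatial_channels=(256, 256, 256, 512),
    out_channels=1024)
feats = model(torch.randn(1, 3, 256, 512))
assert feats[0].shape == (1, 1024, 32, 64)  # fused 1/8 map for the decode head
assert feats[1].shape == (1, 512, 32, 64)   # 1/8 context map, auxiliary head 1
assert feats[2].shape == (1, 512, 16, 32)   # 1/16 context map, auxiliary head 2
```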
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..5625a76c08
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,7 @@
+_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py'
+model = dict(
+ type='EncoderDecoder',
+ backbone=dict(
+ backbone_cfg=dict(
+ init_cfg=dict(
+ type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))))
diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py
index 24e2397235..1f88bdda6c 100644
--- a/mmseg/models/backbones/__init__.py
+++ b/mmseg/models/backbones/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
+from .bisenetv1 import BiSeNetV1
from .bisenetv2 import BiSeNetV2
from .cgnet import CGNet
from .fast_scnn import FastSCNN
@@ -16,5 +17,6 @@
__all__ = [
'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
- 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV2'
+ 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer',
+ 'BiSeNetV1', 'BiSeNetV2'
]
diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py
new file mode 100644
index 0000000000..4beb7b394d
--- /dev/null
+++ b/mmseg/models/backbones/bisenetv1.py
@@ -0,0 +1,332 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from mmseg.ops import resize
+from ..builder import BACKBONES, build_backbone
+
+
+class SpatialPath(BaseModule):
+ """Spatial Path to preserve the spatial size of the original input image
+ and encode affluent spatial information.
+
+ Args:
+        in_channels (int): The number of channels of input
+            image. Default: 3.
+        num_channels (Tuple[int]): The number of channels of
+            each layer in Spatial Path.
+            Default: (64, 64, 64, 128).
+ Returns:
+ x (torch.Tensor): Feature map for Feature Fusion Module.
+ """
+
+ def __init__(self,
+ in_channels=3,
+ num_channels=(64, 64, 64, 128),
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ init_cfg=None):
+ super(SpatialPath, self).__init__(init_cfg=init_cfg)
+ assert len(num_channels) == 4, 'Length of input channels \
+ of Spatial Path must be 4!'
+
+ self.layers = []
+ for i in range(len(num_channels)):
+ layer_name = f'layer{i + 1}'
+ self.layers.append(layer_name)
+ if i == 0:
+ self.add_module(
+ layer_name,
+ ConvModule(
+ in_channels=in_channels,
+ out_channels=num_channels[i],
+ kernel_size=7,
+ stride=2,
+ padding=3,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ elif i == len(num_channels) - 1:
+ self.add_module(
+ layer_name,
+ ConvModule(
+ in_channels=num_channels[i - 1],
+ out_channels=num_channels[i],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ else:
+ self.add_module(
+ layer_name,
+ ConvModule(
+ in_channels=num_channels[i - 1],
+ out_channels=num_channels[i],
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ def forward(self, x):
+ for i, layer_name in enumerate(self.layers):
+ layer_stage = getattr(self, layer_name)
+ x = layer_stage(x)
+ return x
+
+
+class AttentionRefinementModule(BaseModule):
+ """Attention Refinement Module (ARM) to refine the features of each stage.
+
+ Args:
+ in_channels (int): The number of input channels.
+        out_channel (int): The number of output channels.
+ Returns:
+ x_out (torch.Tensor): Feature map of Attention Refinement Module.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channel,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ init_cfg=None):
+ super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg)
+ self.conv_layer = ConvModule(
+ in_channels=in_channels,
+ out_channels=out_channel,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.atten_conv_layer = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, 1)),
+ ConvModule(
+ in_channels=out_channel,
+ out_channels=out_channel,
+ kernel_size=1,
+ bias=False,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None), nn.Sigmoid())
+
+ def forward(self, x):
+ x = self.conv_layer(x)
+ x_atten = self.atten_conv_layer(x)
+ x_out = x * x_atten
+ return x_out
+
+
+class ContextPath(BaseModule):
+ """Context Path to provide sufficient receptive field.
+
+ Args:
+        backbone_cfg (dict): Config of backbone of
+            Context Path.
+        context_channels (Tuple[int]): The number of channels of
+            each module in Context Path.
+            Default: (128, 256, 512).
+ align_corners (bool, optional): The align_corners argument of
+ resize operation. Default: False.
+ Returns:
+        x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps
+            upsampled from the 1/16 and 1/32 downsampled feature maps.
+            These two feature maps are used by the Feature Fusion
+            Module and the auxiliary heads.
+ """
+
+ def __init__(self,
+ backbone_cfg,
+ context_channels=(128, 256, 512),
+ align_corners=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ init_cfg=None):
+ super(ContextPath, self).__init__(init_cfg=init_cfg)
+ assert len(context_channels) == 3, 'Length of input channels \
+ of Context Path must be 3!'
+
+ self.backbone = build_backbone(backbone_cfg)
+
+ self.align_corners = align_corners
+ self.arm16 = AttentionRefinementModule(context_channels[1],
+ context_channels[0])
+ self.arm32 = AttentionRefinementModule(context_channels[2],
+ context_channels[0])
+ self.conv_head32 = ConvModule(
+ in_channels=context_channels[0],
+ out_channels=context_channels[0],
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.conv_head16 = ConvModule(
+ in_channels=context_channels[0],
+ out_channels=context_channels[0],
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.gap_conv = nn.Sequential(
+ nn.AdaptiveAvgPool2d((1, 1)),
+ ConvModule(
+ in_channels=context_channels[2],
+ out_channels=context_channels[0],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ def forward(self, x):
+ x_4, x_8, x_16, x_32 = self.backbone(x)
+ x_gap = self.gap_conv(x_32)
+
+ x_32_arm = self.arm32(x_32)
+ x_32_sum = x_32_arm + x_gap
+ x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest')
+ x_32_up = self.conv_head32(x_32_up)
+
+ x_16_arm = self.arm16(x_16)
+ x_16_sum = x_16_arm + x_32_up
+ x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest')
+ x_16_up = self.conv_head16(x_16_up)
+
+ return x_16_up, x_32_up
+
+
+class FeatureFusionModule(BaseModule):
+ """Feature Fusion Module to fuse low level output feature of Spatial Path
+ and high level output feature of Context Path.
+
+ Args:
+ in_channels (int): The number of input channels.
+ out_channels (int): The number of output channels.
+ Returns:
+ x_out (torch.Tensor): Feature map of Feature Fusion Module.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ act_cfg=dict(type='ReLU'),
+ init_cfg=None):
+ super(FeatureFusionModule, self).__init__(init_cfg=init_cfg)
+ self.conv1 = ConvModule(
+ in_channels=in_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg)
+ self.gap = nn.AdaptiveAvgPool2d((1, 1))
+ self.conv_atten = nn.Sequential(
+ ConvModule(
+ in_channels=out_channels,
+ out_channels=out_channels,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg), nn.Sigmoid())
+
+ def forward(self, x_sp, x_cp):
+ x_concat = torch.cat([x_sp, x_cp], dim=1)
+ x_fuse = self.conv1(x_concat)
+ x_atten = self.gap(x_fuse)
+        # Note: the paper's attention branch uses two 1x1 convs and no BN.
+ x_atten = self.conv_atten(x_atten)
+ x_atten = x_fuse * x_atten
+ x_out = x_atten + x_fuse
+ return x_out
+
+
+@BACKBONES.register_module()
+class BiSeNetV1(BaseModule):
+ """BiSeNetV1 backbone.
+
+    This backbone is the implementation of `BiSeNet: Bilateral
+    Segmentation Network for Real-time Semantic
+    Segmentation <https://arxiv.org/abs/1808.00897>`_.
+
+ Args:
+        backbone_cfg (dict): Config of backbone of
+            Context Path.
+        in_channels (int): The number of channels of input
+            image. Default: 3.
+        spatial_channels (Tuple[int]): The number of channels of
+            each layer in Spatial Path.
+            Default: (64, 64, 64, 128).
+        context_channels (Tuple[int]): The number of channels of
+            each module in Context Path.
+            Default: (128, 256, 512).
+ out_indices (Tuple[int] | int, optional): Output from which stages.
+ Default: (0, 1, 2).
+        align_corners (bool, optional): The align_corners argument of
+            resize operation in Context Path.
+            Default: False.
+        out_channels (int): The number of channels of output.
+            It must be the same as `in_channels` of the decode head.
+            Default: 256.
+ """
+
+ def __init__(self,
+ backbone_cfg,
+ in_channels=3,
+ spatial_channels=(64, 64, 64, 128),
+ context_channels=(128, 256, 512),
+ out_indices=(0, 1, 2),
+ align_corners=False,
+ out_channels=256,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ act_cfg=dict(type='ReLU'),
+ init_cfg=None):
+
+ super(BiSeNetV1, self).__init__(init_cfg=init_cfg)
+ assert len(spatial_channels) == 4, 'Length of input channels \
+ of Spatial Path must be 4!'
+
+ assert len(context_channels) == 3, 'Length of input channels \
+ of Context Path must be 3!'
+
+ self.out_indices = out_indices
+ self.align_corners = align_corners
+ self.context_path = ContextPath(backbone_cfg, context_channels,
+ self.align_corners)
+ self.spatial_path = SpatialPath(in_channels, spatial_channels)
+ self.ffm = FeatureFusionModule(context_channels[1], out_channels)
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.act_cfg = act_cfg
+
+ def forward(self, x):
+        # Refactored forward adapted from CoinCheung's BiSeNet, thanks.
+ x_context8, x_context16 = self.context_path(x)
+ x_spatial = self.spatial_path(x)
+ x_fuse = self.ffm(x_spatial, x_context8)
+
+ outs = [x_fuse, x_context8, x_context16]
+ outs = [outs[i] for i in self.out_indices]
+ return tuple(outs)
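To make the wiring between this backbone and the heads in `configs/_base_/models/bisenetv1_r18-d32.py` concrete, here is a hedged forward-shape sketch with the default R-18 settings (plain `BN`, single process, and the 1024x1024 crop used by the Cityscapes configs):

```python
import torch
from mmseg.models.backbones import BiSeNetV1

r18_cfg = dict(
    type='ResNet', depth=18, num_stages=4, out_indices=(0, 1, 2, 3),
    dilations=(1, 1, 1, 1), strides=(1, 2, 2, 2), style='pytorch',
    contract_dilation=True)
model = BiSeNetV1(backbone_cfg=r18_cfg)
fuse, ctx8, ctx16 = model(torch.randn(1, 3, 1024, 1024))
print(fuse.shape)   # torch.Size([1, 256, 128, 128]) -> decode head (in_index=0)
print(ctx8.shape)   # torch.Size([1, 128, 128, 128]) -> auxiliary head (in_index=1)
print(ctx16.shape)  # torch.Size([1, 128, 64, 64])   -> auxiliary head (in_index=2)
```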
diff --git a/model-index.yml b/model-index.yml
index 1fa927ad92..7d18380c76 100644
--- a/model-index.yml
+++ b/model-index.yml
@@ -1,6 +1,7 @@
Import:
- configs/ann/ann.yml
- configs/apcnet/apcnet.yml
+- configs/bisenetv1/bisenetv1.yml
- configs/bisenetv2/bisenetv2.yml
- configs/ccnet/ccnet.yml
- configs/cgnet/cgnet.yml
diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py
new file mode 100644
index 0000000000..8e1571d6fb
--- /dev/null
+++ b/tests/test_models/test_backbones/test_bisenetv1.py
@@ -0,0 +1,109 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pytest
+import torch
+
+from mmseg.models.backbones import BiSeNetV1
+from mmseg.models.backbones.bisenetv1 import (AttentionRefinementModule,
+ ContextPath, FeatureFusionModule,
+ SpatialPath)
+
+
+def test_bisenetv1_backbone():
+ # Test BiSeNetV1 Standard Forward
+ backbone_cfg = dict(
+ type='ResNet',
+ in_channels=3,
+ depth=18,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True)
+ model = BiSeNetV1(in_channels=3, backbone_cfg=backbone_cfg)
+ model.init_weights()
+ model.train()
+ batch_size = 2
+ imgs = torch.randn(batch_size, 3, 256, 512)
+ feat = model(imgs)
+
+ assert len(feat) == 3
+    # output of the fused path for the decode head
+ assert feat[0].shape == torch.Size([batch_size, 256, 32, 64])
+ # for auxiliary head 1
+ assert feat[1].shape == torch.Size([batch_size, 128, 32, 64])
+ # for auxiliary head 2
+ assert feat[2].shape == torch.Size([batch_size, 128, 16, 32])
+
+ # Test input with rare shape
+ batch_size = 2
+ imgs = torch.randn(batch_size, 3, 527, 279)
+ feat = model(imgs)
+ assert len(feat) == 3
+
+ with pytest.raises(AssertionError):
+ # BiSeNetV1 spatial path channel constraints.
+ BiSeNetV1(
+ backbone_cfg=backbone_cfg,
+ in_channels=3,
+ spatial_channels=(64, 64, 64))
+
+ with pytest.raises(AssertionError):
+ # BiSeNetV1 context path constraints.
+ BiSeNetV1(
+ backbone_cfg=backbone_cfg,
+ in_channels=3,
+ context_channels=(128, 256, 512, 1024))
+
+
+def test_bisenetv1_spatial_path():
+ with pytest.raises(AssertionError):
+ # BiSeNetV1 spatial path channel constraints.
+ SpatialPath(num_channels=(64, 64, 64), in_channels=3)
+
+
+def test_bisenetv1_context_path():
+ backbone_cfg = dict(
+ type='ResNet',
+ in_channels=3,
+ depth=50,
+ num_stages=4,
+ out_indices=(0, 1, 2, 3),
+ dilations=(1, 1, 1, 1),
+ strides=(1, 2, 2, 2),
+ norm_eval=False,
+ style='pytorch',
+ contract_dilation=True)
+
+ with pytest.raises(AssertionError):
+ # BiSeNetV1 context path constraints.
+ ContextPath(
+ backbone_cfg=backbone_cfg, context_channels=(128, 256, 512, 1024))
+
+
+def test_bisenetv1_attention_refinement_module():
+ x_arm = AttentionRefinementModule(256, 64)
+ assert x_arm.conv_layer.in_channels == 256
+ assert x_arm.conv_layer.out_channels == 64
+ assert x_arm.conv_layer.kernel_size == (3, 3)
+ x = torch.randn(2, 256, 32, 64)
+ x_out = x_arm(x)
+ assert x_out.shape == torch.Size([2, 64, 32, 64])
+
+
+def test_bisenetv1_feature_fusion_module():
+ ffm = FeatureFusionModule(128, 256)
+ assert ffm.conv1.in_channels == 128
+ assert ffm.conv1.out_channels == 256
+ assert ffm.conv1.kernel_size == (1, 1)
+ assert ffm.gap.output_size == (1, 1)
+ assert ffm.conv_atten[0].in_channels == 256
+ assert ffm.conv_atten[0].out_channels == 256
+ assert ffm.conv_atten[0].kernel_size == (1, 1)
+
+ ffm = FeatureFusionModule(128, 128)
+ x1 = torch.randn(2, 64, 64, 128)
+ x2 = torch.randn(2, 64, 64, 128)
+ x_out = ffm(x1, x2)
+ assert x_out.shape == torch.Size([2, 128, 64, 128])