diff --git a/README.md b/README.md index 28c904cd34..2443171c86 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ Supported methods: - [x] [PSPNet (CVPR'2017)](configs/pspnet) - [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) - [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16) +- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) - [x] [PSANet (ECCV'2018)](configs/psanet) - [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) - [x] [UPerNet (ECCV'2018)](configs/upernet) diff --git a/README_zh-CN.md b/README_zh-CN.md index cc750f613b..ac90eefeef 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -74,6 +74,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O - [x] [PSPNet (CVPR'2017)](configs/pspnet) - [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) - [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16) +- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) - [x] [PSANet (ECCV'2018)](configs/psanet) - [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) - [x] [UPerNet (ECCV'2018)](configs/upernet) diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py new file mode 100644 index 0000000000..40698644ba --- /dev/null +++ b/configs/_base_/models/bisenetv1_r18-d32.py @@ -0,0 +1,68 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='BiSeNetV1', + in_channels=3, + context_channels=(128, 256, 512), + spatial_channels=(64, 64, 64, 128), + out_indices=(0, 1, 2), + out_channels=256, + backbone_cfg=dict( + type='ResNet', + in_channels=3, + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + norm_cfg=norm_cfg, + align_corners=False, + init_cfg=None), + decode_head=dict( + type='FCNHead', + in_channels=256, + in_index=0, + channels=256, + num_convs=1, + concat_input=False, + 
dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md new file mode 100644 index 0000000000..344781068a --- /dev/null +++ b/configs/bisenetv1/README.md @@ -0,0 +1,42 @@ +# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation + +## Introduction + + + +Official Repo + +Code Snippet + +
+BiSeNetV1 (ECCV'2018) + +```latex +@inproceedings{yu2018bisenet, + title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, + author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={325--341}, + year={2018} +} +``` + +
+ +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) | +| BiSeNetV1| R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) | +| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) | +| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) | +| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) | + +Note: + +- `4x8`: Using 4 GPUs with 8 samples per GPU in training. +- Default setting is 4 GPUs with 4 samples per GPU in training. 
+- `No Pretrain` means the model is trained from scratch. diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml new file mode 100644 index 0000000000..6de872b863 --- /dev/null +++ b/configs/bisenetv1/bisenetv1.yml @@ -0,0 +1,125 @@ +Collections: +- Name: bisenetv1 + Metadata: + Training Data: + - Cityscapes + Paper: + URL: https://arxiv.org/abs/1808.00897 + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + README: configs/bisenetv1/README.md + Code: + URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Version: v0.18.0 + Converted From: + Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet +Models: +- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + lr schd: 160000 + inference time (ms/im): + - value: 31.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1024,1024) + memory (GB): 5.69 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.44 + mIoU(ms+flip): 77.05 + Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth +- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + lr schd: 160000 + inference time (ms/im): + - value: 31.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1024,1024) + memory (GB): 5.69 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.37 + mIoU(ms+flip): 76.91 + Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth +- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + lr schd: 160000 + inference time (ms/im): + - value: 31.48 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1024,1024) + memory (GB): 11.17 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.16 + mIoU(ms+flip): 77.24 + Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth +- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: (1024,1024) + lr schd: 160000 + inference time (ms/im): + - value: 129.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1024,1024) + memory (GB): 3.3 + Results: + - Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.92 + mIoU(ms+flip): 78.87 + Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth +- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: (1024,1024) + lr schd: 160000 + inference time (ms/im): + - value: 129.7 + hardware: V100 + backend: PyTorch + batch size: 1 + mode: FP32 + resolution: (1024,1024) + memory (GB): 15.39 + Results: + - Task: Semantic 
Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.68 + mIoU(ms+flip): 79.57 + Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..f4019e930e --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,11 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, +) diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..ef061a16bd --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,16 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')))) +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, +) diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py new file mode 100644 
# BiSeNetV1 with a ResNet-50 context path, trained from scratch on Cityscapes
# (4 GPUs x 4 samples per GPU, 1024x1024 crops, 160k iterations).
_base_ = [
    '../_base_/models/bisenetv1_r18-d32.py',
    '../_base_/datasets/cityscapes_1024x1024.py',
    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
]
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='BiSeNetV1',
        # Channel widths follow the ResNet-50 stage outputs instead of the
        # ResNet-18 widths inherited from the base model config.
        context_channels=(512, 1024, 2048),
        spatial_channels=(256, 256, 256, 512),
        out_channels=1024,
        # This is the "No Pretrain" variant: deliberately no `init_cfg` here.
        # The ImageNet-pretrained variant
        # (bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py)
        # inherits from this file and adds the `Pretrained` init_cfg itself.
        backbone_cfg=dict(type='ResNet', depth=50)),
    decode_head=dict(
        type='FCNHead', in_channels=1024, in_index=0, channels=1024),
    # A list value replaces the base list as a whole, so both auxiliary heads
    # are spelled out in full.
    auxiliary_head=[
        dict(
            type='FCNHead',
            in_channels=512,
            channels=256,
            num_convs=1,
            num_classes=19,
            in_index=1,
            norm_cfg=norm_cfg,
            concat_input=False),
        dict(
            type='FCNHead',
            in_channels=512,
            channels=256,
            num_convs=1,
            num_classes=19,
            in_index=2,
            norm_cfg=norm_cfg,
            concat_input=False),
    ])
lr_config = dict(warmup='linear', warmup_iters=1000)
optimizer = dict(lr=0.05)
data = dict(
    samples_per_gpu=4,
    workers_per_gpu=4,
)
-0,0 +1,7 @@ +_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py' +model = dict( + type='EncoderDecoder', + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index 24e2397235..1f88bdda6c 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .bisenetv1 import BiSeNetV1 from .bisenetv2 import BiSeNetV2 from .cgnet import CGNet from .fast_scnn import FastSCNN @@ -16,5 +17,6 @@ __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', - 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV2' + 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', + 'BiSeNetV1', 'BiSeNetV2' ] diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py new file mode 100644 index 0000000000..4beb7b394d --- /dev/null +++ b/mmseg/models/backbones/bisenetv1.py @@ -0,0 +1,332 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule + +from mmseg.ops import resize +from ..builder import BACKBONES, build_backbone + + +class SpatialPath(BaseModule): + """Spatial Path to preserve the spatial size of the original input image + and encode affluent spatial information. + + Args: + in_channels(int): The number of channels of input + image. Default: 3. + num_channels (Tuple[int]): The number of channels of + each layers in Spatial Path. + Default: (64, 64, 64, 128). + Returns: + x (torch.Tensor): Feature map for Feature Fusion Module. 
class SpatialPath(BaseModule):
    """Spatial Path to preserve the spatial size of the original input image
    and encode affluent spatial information.

    The path is a stack of four ``ConvModule`` layers registered as
    ``layer1`` ... ``layer4``: a 7x7 stride-2 conv, two 3x3 stride-2 convs
    and a final 1x1 stride-1 conv.

    Args:
        in_channels(int): The number of channels of input
            image. Default: 3.
        num_channels (Tuple[int]): The number of channels of
            each layers in Spatial Path.
            Default: (64, 64, 64, 128).
        conv_cfg (dict | None): Config passed to every ``ConvModule`` as
            ``conv_cfg``. Default: None.
        norm_cfg (dict): Config passed to every ``ConvModule`` as
            ``norm_cfg``. Default: dict(type='BN').
        act_cfg (dict): Config passed to every ``ConvModule`` as
            ``act_cfg``. Default: dict(type='ReLU').
        init_cfg (dict | list[dict] | None): Initialization config passed
            to ``BaseModule``. Default: None.
    Returns:
        x (torch.Tensor): Feature map for Feature Fusion Module.
    Raises:
        AssertionError: If ``num_channels`` does not have exactly 4 entries.
    """

    def __init__(self,
                 in_channels=3,
                 num_channels=(64, 64, 64, 128),
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(SpatialPath, self).__init__(init_cfg=init_cfg)
        # Build the message from one plain literal; a backslash continuation
        # inside the string would embed the source indentation in the text.
        assert len(num_channels) == 4, \
            'Length of input channels of Spatial Path must be 4!'

        self.layers = []
        last_index = len(num_channels) - 1
        prev_channels = in_channels
        for i, channels in enumerate(num_channels):
            # Only the conv geometry differs between the stem, the middle
            # layers and the final projection.
            if i == 0:
                kernel_size, stride, padding = 7, 2, 3
            elif i == last_index:
                kernel_size, stride, padding = 1, 1, 0
            else:
                kernel_size, stride, padding = 3, 2, 1

            layer_name = f'layer{i + 1}'
            self.layers.append(layer_name)
            self.add_module(
                layer_name,
                ConvModule(
                    in_channels=prev_channels,
                    out_channels=channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=padding,
                    conv_cfg=conv_cfg,
                    norm_cfg=norm_cfg,
                    act_cfg=act_cfg))
            prev_channels = channels

    def forward(self, x):
        """Apply ``layer1`` ... ``layer4`` in order and return the result."""
        for layer_name in self.layers:
            x = getattr(self, layer_name)(x)
        return x
class AttentionRefinementModule(BaseModule):
    """Attention Refinement Module (ARM) to refine the features of each stage.

    A 3x3 ``ConvModule`` produces the feature map; a channel-attention
    branch (global average pooling, 1x1 ``ConvModule`` without activation,
    sigmoid) produces per-channel weights that rescale that feature map.

    Args:
        in_channels (int): The number of input channels.
        out_channel (int): The number of output channels.
        conv_cfg (dict | None): ``conv_cfg`` of both ``ConvModule`` layers.
            Default: None.
        norm_cfg (dict): ``norm_cfg`` of both ``ConvModule`` layers.
            Default: dict(type='BN').
        act_cfg (dict): ``act_cfg`` of the 3x3 ``ConvModule`` only.
            Default: dict(type='ReLU').
        init_cfg (dict | list[dict] | None): Initialization config passed
            to ``BaseModule``. Default: None.
    Returns:
        x_out (torch.Tensor): Feature map of Attention Refinement Module.
    """

    def __init__(self,
                 in_channels,
                 out_channel,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        # Feature branch: 3x3 conv that maps in_channels -> out_channel.
        self.conv_layer = ConvModule(
            in_channels=in_channels,
            out_channels=out_channel,
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        # Attention branch: pool to 1x1, project, squash with a sigmoid.
        global_pool = nn.AdaptiveAvgPool2d((1, 1))
        channel_proj = ConvModule(
            in_channels=out_channel,
            out_channels=out_channel,
            kernel_size=1,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None)
        gate = nn.Sigmoid()
        self.atten_conv_layer = nn.Sequential(global_pool, channel_proj, gate)

    def forward(self, x):
        """Return the conv features scaled by their channel attention."""
        feat = self.conv_layer(x)
        weights = self.atten_conv_layer(feat)
        return feat * weights
class ContextPath(BaseModule):
    """Context Path to provide sufficient receptive field.

    Args:
        backbone_cfg:(dict): Config of backbone of
            Context Path.
        context_channels (Tuple[int]): The number of channel numbers
            of various modules in Context Path.
            Default: (128, 256, 512).
        align_corners (bool, optional): The align_corners argument of
            resize operation. It is stored on the module; the
            nearest-neighbour resizes in ``forward`` do not use it.
            Default: False.
        conv_cfg (dict | None): ``conv_cfg`` of every ``ConvModule`` built
            here, including those inside the two Attention Refinement
            Modules. Default: None.
        norm_cfg (dict): ``norm_cfg`` of those layers.
            Default: dict(type='BN').
        act_cfg (dict): ``act_cfg`` of those layers.
            Default: dict(type='ReLU').
        init_cfg (dict | list[dict] | None): Initialization config passed
            to ``BaseModule``. Default: None.
    Returns:
        x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps
            undergoing upsampling from 1/16 and 1/32 downsampling
            feature maps. These two feature maps are used for Feature
            Fusion Module and Auxiliary Head.
    Raises:
        AssertionError: If ``context_channels`` does not have exactly
            3 entries.
    """

    def __init__(self,
                 backbone_cfg,
                 context_channels=(128, 256, 512),
                 align_corners=False,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super(ContextPath, self).__init__(init_cfg=init_cfg)
        # One plain literal: a backslash continuation inside the string
        # would embed the source indentation in the message.
        assert len(context_channels) == 3, \
            'Length of input channels of Context Path must be 3!'

        self.backbone = build_backbone(backbone_cfg)

        self.align_corners = align_corners
        # Forward the layer configs to the ARMs as well; otherwise they
        # silently fall back to their own BN/ReLU defaults regardless of
        # what the caller asked for.
        self.arm16 = AttentionRefinementModule(
            context_channels[1],
            context_channels[0],
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.arm32 = AttentionRefinementModule(
            context_channels[2],
            context_channels[0],
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv_head32 = ConvModule(
            in_channels=context_channels[0],
            out_channels=context_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv_head16 = ConvModule(
            in_channels=context_channels[0],
            out_channels=context_channels[0],
            kernel_size=3,
            stride=1,
            padding=1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # Global context: pool the deepest feature map to 1x1 and project
        # it to the common width so it can be added to the ARM output.
        self.gap_conv = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            ConvModule(
                in_channels=context_channels[2],
                out_channels=context_channels[0],
                kernel_size=1,
                stride=1,
                padding=0,
                conv_cfg=conv_cfg,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg))

    def forward(self, x):
        """Run the backbone and fuse its two deepest stages top-down.

        The backbone must return four feature maps; the first one is not
        used here.
        """
        _, x_8, x_16, x_32 = self.backbone(x)
        x_gap = self.gap_conv(x_32)

        # Deepest stage: ARM + broadcast global context, then resize to the
        # spatial size of the next-shallower stage.
        x_32_arm = self.arm32(x_32)
        x_32_sum = x_32_arm + x_gap
        x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest')
        x_32_up = self.conv_head32(x_32_up)

        x_16_arm = self.arm16(x_16)
        x_16_sum = x_16_arm + x_32_up
        x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest')
        x_16_up = self.conv_head16(x_16_up)

        return x_16_up, x_32_up
class FeatureFusionModule(BaseModule):
    """Feature Fusion Module to fuse low level output feature of Spatial Path
    and high level output feature of Context Path.

    The two inputs are concatenated along the channel axis and projected by
    a 1x1 ``ConvModule``. A channel-attention branch (global average
    pooling, 1x1 ``ConvModule``, sigmoid) then reweights the fused map, and
    the reweighted map is added back to the fused map.

    Args:
        in_channels (int): The number of input channels, i.e. the channel
            count of the concatenated inputs.
        out_channels (int): The number of output channels.
        conv_cfg (dict | None): ``conv_cfg`` of both ``ConvModule`` layers.
            Default: None.
        norm_cfg (dict): ``norm_cfg`` of both ``ConvModule`` layers.
            Default: dict(type='BN').
        act_cfg (dict): ``act_cfg`` of both ``ConvModule`` layers.
            Default: dict(type='ReLU').
        init_cfg (dict | list[dict] | None): Initialization config passed
            to ``BaseModule``. Default: None.
    Returns:
        x_out (torch.Tensor): Feature map of Feature Fusion Module.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)
        # Projection applied to the concatenated spatial/context features.
        self.conv1 = ConvModule(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        # Note: No BN and more 1x1 conv in paper.
        atten_proj = ConvModule(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        self.conv_atten = nn.Sequential(atten_proj, nn.Sigmoid())

    def forward(self, x_sp, x_cp):
        """Fuse the spatial-path and context-path feature maps."""
        fused = self.conv1(torch.cat((x_sp, x_cp), dim=1))
        weights = self.conv_atten(self.gap(fused))
        # Attention-weighted features plus a residual of the fused map.
        return fused * weights + fused
@BACKBONES.register_module()
class BiSeNetV1(BaseModule):
    """BiSeNetV1 backbone.

    This backbone is the implementation of `BiSeNet: Bilateral
    Segmentation Network for Real-time Semantic
    Segmentation <https://arxiv.org/abs/1808.00897>`_.

    Args:
        backbone_cfg:(dict): Config of backbone of
            Context Path.
        in_channels (int): The number of channels of input
            image. Default: 3.
        spatial_channels (Tuple[int]): Size of channel numbers of
            various layers in Spatial Path.
            Default: (64, 64, 64, 128).
        context_channels (Tuple[int]): Size of channel numbers of
            various modules in Context Path.
            Default: (128, 256, 512).
        out_indices (Tuple[int] | int, optional): Output from which stages.
            Index 0 is the fused feature map, 1 and 2 are the two Context
            Path outputs. A single int is treated as a one-element tuple.
            Default: (0, 1, 2).
        align_corners (bool, optional): The align_corners argument
            forwarded to the Context Path.
            Default: False.
        out_channels(int): The number of channels of output.
            It must be the same with `in_channels` of decode_head.
            Default: 256.
        conv_cfg (dict | None): ``conv_cfg`` forwarded to the Context Path,
            the Spatial Path and the Feature Fusion Module. Default: None.
        norm_cfg (dict): ``norm_cfg`` forwarded to the same sub-modules.
            Default: dict(type='BN', requires_grad=True).
        act_cfg (dict): ``act_cfg`` forwarded to the same sub-modules.
            Default: dict(type='ReLU').
        init_cfg (dict | list[dict] | None): Initialization config passed
            to ``BaseModule``. Default: None.
    Raises:
        AssertionError: If ``spatial_channels`` does not have 4 entries or
            ``context_channels`` does not have 3 entries.
    """

    def __init__(self,
                 backbone_cfg,
                 in_channels=3,
                 spatial_channels=(64, 64, 64, 128),
                 context_channels=(128, 256, 512),
                 out_indices=(0, 1, 2),
                 align_corners=False,
                 out_channels=256,
                 conv_cfg=None,
                 norm_cfg=dict(type='BN', requires_grad=True),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None):

        super(BiSeNetV1, self).__init__(init_cfg=init_cfg)
        # Plain literals: a backslash continuation inside the string would
        # embed the source indentation in the message.
        assert len(spatial_channels) == 4, \
            'Length of input channels of Spatial Path must be 4!'

        assert len(context_channels) == 3, \
            'Length of input channels of Context Path must be 3!'

        # The docstring allows a bare int; forward() iterates the indices.
        if isinstance(out_indices, int):
            out_indices = (out_indices, )
        self.out_indices = out_indices
        self.align_corners = align_corners
        self.conv_cfg = conv_cfg
        self.norm_cfg = norm_cfg
        self.act_cfg = act_cfg

        # Forward the layer configs so that e.g. a SyncBN `norm_cfg` set on
        # this backbone is honoured by every sub-module instead of being
        # silently replaced by their own BN/ReLU defaults.
        layer_cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.context_path = ContextPath(backbone_cfg, context_channels,
                                        self.align_corners, **layer_cfg)
        self.spatial_path = SpatialPath(in_channels, spatial_channels,
                                        **layer_cfg)
        # The FFM concatenates the Spatial Path output
        # (spatial_channels[-1] channels) with the Context Path output
        # (context_channels[0] channels), so its input width is their sum.
        self.ffm = FeatureFusionModule(
            spatial_channels[-1] + context_channels[0], out_channels,
            **layer_cfg)

    def forward(self, x):
        """Return the feature maps selected by ``out_indices`` as a tuple."""
        # stole refactoring code from Coin Cheung, thanks
        x_context8, x_context16 = self.context_path(x)
        x_spatial = self.spatial_path(x)
        x_fuse = self.ffm(x_spatial, x_context8)

        outs = [x_fuse, x_context8, x_context16]
        outs = [outs[i] for i in self.out_indices]
        return tuple(outs)
import pytest
import torch

from mmseg.models.backbones import BiSeNetV1
from mmseg.models.backbones.bisenetv1 import (AttentionRefinementModule,
                                              ContextPath, FeatureFusionModule,
                                              SpatialPath)


def _resnet_cfg(depth):
    """Return the ResNet context-path config used by these tests."""
    return dict(
        type='ResNet',
        in_channels=3,
        depth=depth,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 1, 1),
        strides=(1, 2, 2, 2),
        norm_eval=False,
        style='pytorch',
        contract_dilation=True)


def test_bisenetv1_backbone():
    """Forward shapes and constructor constraints of the full backbone."""
    backbone_cfg = _resnet_cfg(18)

    # Standard forward pass.
    net = BiSeNetV1(in_channels=3, backbone_cfg=backbone_cfg)
    net.init_weights()
    net.train()
    batch_size = 2
    outs = net(torch.randn(batch_size, 3, 256, 512))

    assert len(outs) == 3
    expected_shapes = [
        [batch_size, 256, 32, 64],  # output for segment head
        [batch_size, 128, 32, 64],  # for auxiliary head 1
        [batch_size, 128, 16, 32],  # for auxiliary head 2
    ]
    for out, shape in zip(outs, expected_shapes):
        assert out.shape == torch.Size(shape)

    # Input whose sides are not multiples of the network stride.
    batch_size = 2
    outs = net(torch.randn(batch_size, 3, 527, 279))
    assert len(outs) == 3

    # Spatial path must be given exactly four channel sizes.
    with pytest.raises(AssertionError):
        BiSeNetV1(
            backbone_cfg=backbone_cfg,
            in_channels=3,
            spatial_channels=(64, 64, 64))

    # Context path must be given exactly three channel sizes.
    with pytest.raises(AssertionError):
        BiSeNetV1(
            backbone_cfg=backbone_cfg,
            in_channels=3,
            context_channels=(128, 256, 512, 1024))


def test_bisenetv1_spatial_path():
    """SpatialPath rejects a channel tuple that is not of length 4."""
    with pytest.raises(AssertionError):
        SpatialPath(num_channels=(64, 64, 64), in_channels=3)


def test_bisenetv1_context_path():
    """ContextPath rejects a channel tuple that is not of length 3."""
    backbone_cfg = _resnet_cfg(50)

    with pytest.raises(AssertionError):
        ContextPath(
            backbone_cfg=backbone_cfg, context_channels=(128, 256, 512, 1024))


def test_bisenetv1_attention_refinement_module():
    """ARM layer hyper-parameters and output shape."""
    arm = AttentionRefinementModule(256, 64)
    conv = arm.conv_layer
    assert conv.in_channels == 256
    assert conv.out_channels == 64
    assert conv.kernel_size == (3, 3)

    out = arm(torch.randn(2, 256, 32, 64))
    assert out.shape == torch.Size([2, 64, 32, 64])


def test_bisenetv1_feature_fusion_module():
    """FFM layer hyper-parameters and output shape."""
    ffm = FeatureFusionModule(128, 256)
    assert ffm.conv1.in_channels == 128
    assert ffm.conv1.out_channels == 256
    assert ffm.conv1.kernel_size == (1, 1)
    assert ffm.gap.output_size == (1, 1)
    atten_conv = ffm.conv_atten[0]
    assert atten_conv.in_channels == 256
    assert atten_conv.out_channels == 256
    assert atten_conv.kernel_size == (1, 1)

    # Two 64-channel inputs are concatenated to the 128 input channels.
    ffm = FeatureFusionModule(128, 128)
    x_sp = torch.randn(2, 64, 64, 128)
    x_cp = torch.randn(2, 64, 64, 128)
    out = ffm(x_sp, x_cp)
    assert out.shape == torch.Size([2, 128, 64, 128])