[Feature] Support BiSeNetV1 (open-mmlab#851)

* First Commit * fix typos * fix typos * Fix assertion bug * Adding Assert * Adding Unittest * Fixing typo * Uploading models & logs * Fixing unittest error * changing README.md * changing README.md
bowenroom · Sep 28, 2021 · e701497 · e701497
1 parent 2800d43
commit e701497
Show file tree

Hide file tree

Showing 14 changed files with 767 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ Supported methods:
 - [x] [PSPNet (CVPR'2017)](configs/pspnet)
 - [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
 - [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
+- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
 - [x] [PSANet (ECCV'2018)](configs/psanet)
 - [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
 - [x] [UPerNet (ECCV'2018)](configs/upernet)

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -74,6 +74,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O
 - [x] [PSPNet (CVPR'2017)](configs/pspnet)
 - [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3)
 - [x] [Mixed Precision (FP16) Training (ArXiv'2017)](configs/fp16)
+- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1)
 - [x] [PSANet (ECCV'2018)](configs/psanet)
 - [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus)
 - [x] [UPerNet (ECCV'2018)](configs/upernet)

diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py
@@ -0,0 +1,68 @@
+# model settings
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='BiSeNetV1',
+        in_channels=3,
+        context_channels=(128, 256, 512),
+        spatial_channels=(64, 64, 64, 128),
+        out_indices=(0, 1, 2),
+        out_channels=256,
+        backbone_cfg=dict(
+            type='ResNet',
+            in_channels=3,
+            depth=18,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            dilations=(1, 1, 1, 1),
+            strides=(1, 2, 2, 2),
+            norm_cfg=norm_cfg,
+            norm_eval=False,
+            style='pytorch',
+            contract_dilation=True),
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        init_cfg=None),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=256,
+        in_index=0,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=[
+        dict(
+            type='FCNHead',
+            in_channels=128,
+            channels=64,
+            num_convs=1,
+            num_classes=19,
+            in_index=1,
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+        dict(
+            type='FCNHead',
+            in_channels=128,
+            channels=64,
+            num_convs=1,
+            num_classes=19,
+            in_index=2,
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    ],
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))
diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
@@ -0,0 +1,42 @@
+# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation
+
+## Introduction
+
+<!-- [ALGORITHM] -->
+
+<a href="https://github.com/ycszen/TorchSeg/tree/master/model/bisenet">Official Repo</a>
+
+<a href="https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266">Code Snippet</a>
+
+<details>
+<summary align="right"><a href="https://arxiv.org/abs/1808.00897">BiSeNetV1 (ECCV'2018)</a></summary>
+
+```latex
+@inproceedings{yu2018bisenet,
+  title={Bisenet: Bilateral segmentation network for real-time semantic segmentation},
+  author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong},
+  booktitle={Proceedings of the European conference on computer vision (ECCV)},
+  pages={325--341},
+  year={2018}
+}
+```
+
+</details>
+
+## Results and models
+
+### Cityscapes
+
+| Method    | Backbone  | Crop Size | Lr schd | Mem (GB) | Inf time (fps) |  mIoU | mIoU(ms+flip) | config                                                                                  | download                                                                                                                                                                                                                                                       |
+| --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024  | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
+| BiSeNetV1 | R-18-D32 | 1024x1024  | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
+| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024  | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
+| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024  | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
+| BiSeNetV1 | R-50-D32 | 1024x1024  | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) &#124; [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
+
+Note:
+
+- `4x8`: Using 4 GPUs with 8 samples per GPU in training.
+- Default setting is 4 GPUs with 4 samples per GPU in training.
+- `No Pretrain` means the model is trained from scratch.
diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml
@@ -0,0 +1,125 @@
+Collections:
+- Name: bisenetv1
+  Metadata:
+    Training Data:
+    - Cityscapes
+  Paper:
+    URL: https://arxiv.org/abs/1808.00897
+    Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation'
+  README: configs/bisenetv1/README.md
+  Code:
+    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266
+    Version: v0.18.0
+  Converted From:
+    Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet
+Models:
+- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
+  In Collection: bisenetv1
+  Metadata:
+    backbone: R-18-D32
+    crop size: (1024,1024)
+    lr schd: 160000
+    inference time (ms/im):
+    - value: 31.48
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (1024,1024)
+    memory (GB): 5.69
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 74.44
+      mIoU(ms+flip): 77.05
+  Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth
+- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+  In Collection: bisenetv1
+  Metadata:
+    backbone: R-18-D32
+    crop size: (1024,1024)
+    lr schd: 160000
+    inference time (ms/im):
+    - value: 31.48
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (1024,1024)
+    memory (GB): 5.69
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 74.37
+      mIoU(ms+flip): 76.91
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth
+- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
+  In Collection: bisenetv1
+  Metadata:
+    backbone: R-18-D32
+    crop size: (1024,1024)
+    lr schd: 160000
+    inference time (ms/im):
+    - value: 31.48
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (1024,1024)
+    memory (GB): 11.17
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 75.16
+      mIoU(ms+flip): 77.24
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth
+- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
+  In Collection: bisenetv1
+  Metadata:
+    backbone: R-50-D32
+    crop size: (1024,1024)
+    lr schd: 160000
+    inference time (ms/im):
+    - value: 129.7
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (1024,1024)
+    memory (GB): 3.3
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 76.92
+      mIoU(ms+flip): 78.87
+  Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth
+- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+  In Collection: bisenetv1
+  Metadata:
+    backbone: R-50-D32
+    crop size: (1024,1024)
+    lr schd: 160000
+    inference time (ms/im):
+    - value: 129.7
+      hardware: V100
+      backend: PyTorch
+      batch size: 1
+      mode: FP32
+      resolution: (1024,1024)
+    memory (GB): 15.39
+  Results:
+  - Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.68
+      mIoU(ms+flip): 79.57
+  Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,11 @@
+_base_ = [
+    '../_base_/models/bisenetv1_r18-d32.py',
+    '../_base_/datasets/cityscapes_1024x1024.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.025)
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+)
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,16 @@
+_base_ = [
+    '../_base_/models/bisenetv1_r18-d32.py',
+    '../_base_/datasets/cityscapes_1024x1024.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+model = dict(
+    backbone=dict(
+        backbone_cfg=dict(
+            init_cfg=dict(
+                type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.025)
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+)
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
@@ -0,0 +1,5 @@
+_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,46 @@
+_base_ = [
+    '../_base_/models/bisenetv1_r18-d32.py',
+    '../_base_/datasets/cityscapes_1024x1024.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='BiSeNetV1',
+        context_channels=(512, 1024, 2048),
+        spatial_channels=(256, 256, 256, 512),
+        out_channels=1024,
+        backbone_cfg=dict(
+            # No `init_cfg` here: this is the "No Pretrain" variant, so the
+            # ResNet-50 backbone is trained from scratch.
+            type='ResNet',
+            depth=50)),
+    decode_head=dict(
+        type='FCNHead', in_channels=1024, in_index=0, channels=1024),
+    auxiliary_head=[
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=1,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=2,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+    ])
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.05)
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,7 @@
+_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py'
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        backbone_cfg=dict(
+            init_cfg=dict(
+                type='Pretrained', checkpoint='open-mmlab://resnet50_v1c'))))
diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py
@@ -1,4 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .bisenetv1 import BiSeNetV1
 from .bisenetv2 import BiSeNetV2
 from .cgnet import CGNet
 from .fast_scnn import FastSCNN
@@ -16,5 +17,6 @@
 __all__ = [
     'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN',
     'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
-    'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV2'
+    'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer',
+    'BiSeNetV1', 'BiSeNetV2'
 ]