From a44c172e21a105761f4af02eab0ac55b678c3839 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 15:34:58 +0800 Subject: [PATCH 01/11] First Commit --- .../_base_/datasets/cityscapes_1024x1024.py | 35 ++ configs/_base_/models/bisenetv1_r18-d32.py | 67 ++++ ...1_r18-d32_4x4_1024x1024_160k_cityscapes.py | 11 + mmseg/models/backbones/__init__.py | 3 +- mmseg/models/backbones/bisenetv1.py | 302 ++++++++++++++++++ .../test_backbones/test_bisenetv1.py | 39 +++ 6 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/datasets/cityscapes_1024x1024.py create mode 100644 configs/_base_/models/bisenetv1_r18-d32.py create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py create mode 100644 mmseg/models/backbones/bisenetv1.py create mode 100644 tests/test_models/test_backbones/test_bisenetv1.py diff --git a/configs/_base_/datasets/cityscapes_1024x1024.py b/configs/_base_/datasets/cityscapes_1024x1024.py new file mode 100644 index 0000000000..f98d929723 --- /dev/null +++ b/configs/_base_/datasets/cityscapes_1024x1024.py @@ -0,0 +1,35 @@ +_base_ = './cityscapes.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (1024, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py new file mode 100644 index 0000000000..e9a62def60 --- /dev/null +++ b/configs/_base_/models/bisenetv1_r18-d32.py @@ -0,0 +1,67 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='BiSeNetV1', + in_channel=3, + context_channels=(128, 256, 512), + spatial_channels=(64, 64, 64, 128), + out_indices=(0, 1, 2), + backbone_cfg=dict( + type='ResNet', + in_channels=3, + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + norm_cfg=norm_cfg, + align_corners=False, + init_cfg=None), + decode_head=dict( + type='FCNHead', + in_channels=256, + in_index=0, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + 
align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..f4019e930e --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,11 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, +) diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index 75ef2c3a86..5e4373291d 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .bisenetv1 import BiSeNetV1 from .cgnet import CGNet from .fast_scnn import FastSCNN from .hrnet import HRNet @@ -15,5 +16,5 @@ __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', - 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer' + 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV1' ] diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py new file mode 100644 index 0000000000..20077d387b --- /dev/null +++ b/mmseg/models/backbones/bisenetv1.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule + +from mmseg.ops import resize +from ..builder import BACKBONES, build_backbone + + +class SpatialPath(BaseModule): + """Spatial Path to preserve the spatial size of the original input image + and encode affluent spatial information. + + Args: + spatial_channels (Tuple[int]): Size of channel numbers of + various layers in Spatial Path. + Default: (64, 64, 64, 128). + in_channel(int): Channel of input image. Default: 3. + Returns: + x (torch.Tensor): Feature map for Feature Fusion Module. 
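+
+    Example:
+        >>> # usage sketch: three stride-2 convs give an overall output
+        >>> # stride of 8; the final 1x1 conv only sets the channel width
+        >>> import torch
+        >>> sp = SpatialPath((64, 64, 64, 128), in_channel=3)
+        >>> sp(torch.randn(2, 3, 64, 64)).shape
+        torch.Size([2, 128, 8, 8])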
+ """ + + def __init__(self, + spatial_channels=(64, 64, 64, 128), + in_channel=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(SpatialPath, self).__init__(init_cfg=init_cfg) + self.layer_stages = [] + for i in range(len(spatial_channels)): + layer_name = f'layer{i + 1}' + self.layer_stages.append(layer_name) + if i == 0: + self.add_module( + layer_name, + ConvModule( + in_channels=in_channel, + out_channels=spatial_channels[i], + kernel_size=7, + stride=2, + padding=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + elif i != len(spatial_channels) - 1: + self.add_module( + layer_name, + ConvModule( + in_channels=spatial_channels[i - 1], + out_channels=spatial_channels[i], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.add_module( + layer_name, + ConvModule( + in_channels=spatial_channels[i - 1], + out_channels=spatial_channels[i], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + for i, layer_name in enumerate(self.layer_stages): + layer_stage = getattr(self, layer_name) + x = layer_stage(x) + return x + + +class AttentionRefinementModule(BaseModule): + """Attention Refinement Module (ARM) to refine the features of each stage. + + Args: + in_channel (int): Number of input channels. + out_channels (int): Number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Attention Refinement Module. + """ + + def __init__(self, + in_channel, + out_channel, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg) + self.conv_layer = ConvModule( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.atten_conv_layer = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), nn.Sigmoid()) + + def forward(self, x): + x = self.conv_layer(x) + x_atten = self.atten_conv_layer(x) + x_out = torch.mul(x, x_atten) + return x_out + + +class ContextPath(BaseModule): + """Context Path to provide sufficient receptive field. + + Args: + backbone_cfg:(dict | None): Config of backbone of + Context Path. + context_channels (Tuple[int]): Size of channel numbers of + various modules in Context Path. + Default: (128, 256, 512). + align_corners (bool, optional): The align_corners argument of + resize operation. Default: False. + Returns: + [x_16_up, x_32_up] (List[torch.Tensor]): List of two feature + maps for Feature Fusion Module and Auxiliary Head. 
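+
+    Example:
+        >>> # usage sketch; assumes mmseg's default ResNet-18 backbone,
+        >>> # whose stages emit 64/128/256/512 channels at strides 4/8/16/32
+        >>> import torch
+        >>> cp = ContextPath(dict(type='ResNet', depth=18),
+        ...                  context_channels=(128, 256, 512))
+        >>> x_16_up, x_32_up = cp(torch.randn(2, 3, 64, 64))
+        >>> x_16_up.shape, x_32_up.shape
+        (torch.Size([2, 128, 8, 8]), torch.Size([2, 128, 4, 4]))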
+ """ + + def __init__(self, + backbone_cfg, + context_channels=(128, 256, 512), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(ContextPath, self).__init__(init_cfg=init_cfg) + self.backbone = build_backbone(backbone_cfg) + + self.align_corners = align_corners + self.arm16 = AttentionRefinementModule(context_channels[1], + context_channels[0]) + self.arm32 = AttentionRefinementModule(context_channels[2], + context_channels[0]) + self.conv_head32 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv_head16 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap_conv = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=context_channels[2], + out_channels=context_channels[0], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + x_4, x_8, x_16, x_32 = self.backbone(x) + x_gap = self.gap_conv(x_32) + + x_32_arm = self.arm32(x_32) + x_32_sum = x_32_arm + x_gap + x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest') + x_32_up = self.conv_head32(x_32_up) + + x_16_arm = self.arm16(x_16) + x_16_sum = x_16_arm + x_32_up + x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest') + x_16_up = self.conv_head16(x_16_up) + + return [x_16_up, x_32_up] + + +class FeatureFusionModule(BaseModule): + """Feature Fusion Module to fuse low level output feature of Spatial Path + and high level output feature of Context Path. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Feature Fusion Module. + """ + + def __init__(self, + in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(FeatureFusionModule, self).__init__(init_cfg=init_cfg) + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + # use conv-bn instead of 2 layer mlp, + # so that tensorrt 7.2.3.4 can work for fp16 + self.conv_atten = nn.Sequential( + ConvModule( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), nn.Sigmoid()) + + def forward(self, x_sp, x_cp): + x_concat = torch.cat([x_sp, x_cp], dim=1) + x_fuse = self.conv1(x_concat) + x_atten = self.gap(x_fuse) + # TODO: No BN and more 1x1 conv in paper. + x_atten = self.conv_atten(x_atten) + x_atten = torch.mul(x_fuse, x_atten) + x_out = x_atten + x_fuse + return x_out + + +@BACKBONES.register_module() +class BiSeNetV1(BaseModule): + """BiSeNetV1 backbone. + + This backbone is the implementation of `BiSeNet: Bilateral + Segmentation Network for Real-time Semantic + Segmentation `_. + + Args: + align_corners (bool, optional): The align_corners argument of + resize operation in Bilateral Guided Aggregation Layer. + Default: False. 
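+
+    Example:
+        >>> # usage sketch mirroring the unit test below: the first output
+        >>> # feeds the decode head, the other two the auxiliary heads
+        >>> import torch
+        >>> model = BiSeNetV1(in_channel=3,
+        ...                   backbone_cfg=dict(type='ResNet', depth=18))
+        >>> [tuple(o.shape) for o in model(torch.randn(2, 3, 64, 128))]
+        [(2, 256, 8, 16), (2, 128, 8, 16), (2, 128, 4, 8)]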
+ """ + + def __init__(self, + in_channel=3, + backbone_cfg=None, + spatial_channels=(64, 64, 64, 128), + context_channels=(128, 256, 512), + out_indices=(0, 1, 2), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + init_cfg=None): + + super(BiSeNetV1, self).__init__(init_cfg=init_cfg) + self.out_indices = out_indices + self.align_corners = align_corners + self.context_path = ContextPath(backbone_cfg, context_channels, + self.align_corners) + self.spatial_path = SpatialPath(spatial_channels, in_channel) + self.ffm = FeatureFusionModule(256, 256) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + def forward(self, x): + x_context8, x_context16 = self.context_path(x) + x_spatial = self.spatial_path(x) + x_fuse = self.ffm(x_spatial, x_context8) + + outs = [x_fuse] + [x_context8, x_context16] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py new file mode 100644 index 0000000000..9bcf9e6d73 --- /dev/null +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models.backbones import BiSeNetV1 + + +def test_bisenetv2_backbone(): + # Test BiSeNetV1 Standard Forward + backbone_cfg = dict( + type='ResNet', + in_channels=3, + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_eval=False, + style='pytorch', + contract_dilation=True) + model = BiSeNetV1(in_channel=3, backbone_cfg=backbone_cfg) + model.init_weights() + model.train() + batch_size = 2 + imgs = torch.randn(batch_size, 3, 512, 1024) + feat = model(imgs) + + assert len(feat) == 3 + # output for segment Head + assert feat[0].shape == torch.Size([batch_size, 256, 64, 128]) + # for auxiliary head 1 + assert feat[1].shape == torch.Size([batch_size, 128, 64, 128]) + # for auxiliary head 2 + assert feat[2].shape == torch.Size([batch_size, 128, 32, 64]) + + # Test input with rare shape + batch_size = 2 + imgs = torch.randn(batch_size, 3, 952, 527) + feat = model(imgs) + assert len(feat) == 3 From dfdb981124ac8de8eb75e097c774bb039e62b6f2 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 17:53:22 +0800 Subject: [PATCH 02/11] fix typos --- mmseg/models/backbones/bisenetv1.py | 11 +++++++++++ tests/test_models/test_backbones/test_bisenetv1.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 20077d387b..9ed586e141 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -264,6 +264,17 @@ class BiSeNetV1(BaseModule): Segmentation `_. Args: + in_channel(int): Channel of input image. Default: 3. + backbone_cfg:(dict | None): Config of backbone of + Context Path. + spatial_channels (Tuple[int]): Size of channel numbers of + various layers in Spatial Path. + Default: (64, 64, 64, 128). + context_channels (Tuple[int]): Size of channel numbers of + various modules in Context Path. + Default: (128, 256, 512). + out_indices (Tuple[int] | int, optional): Output from which stages. + Default: (0, 1, 2). align_corners (bool, optional): The align_corners argument of resize operation in Bilateral Guided Aggregation Layer. Default: False. 
diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 9bcf9e6d73..e6ff2f90c8 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -4,7 +4,7 @@ from mmseg.models.backbones import BiSeNetV1 -def test_bisenetv2_backbone(): +def test_bisenetv1_backbone(): # Test BiSeNetV1 Standard Forward backbone_cfg = dict( type='ResNet', From 3d241e0135d88ec47631e6ac55d6c9472414f57d Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 20:01:15 +0800 Subject: [PATCH 03/11] fix typos --- configs/_base_/models/bisenetv1_r18-d32.py | 3 +- mmseg/models/backbones/bisenetv1.py | 103 ++++++++++-------- .../test_backbones/test_bisenetv1.py | 17 ++- 3 files changed, 76 insertions(+), 47 deletions(-) diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py index e9a62def60..40698644ba 100644 --- a/configs/_base_/models/bisenetv1_r18-d32.py +++ b/configs/_base_/models/bisenetv1_r18-d32.py @@ -4,10 +4,11 @@ type='EncoderDecoder', backbone=dict( type='BiSeNetV1', - in_channel=3, + in_channels=3, context_channels=(128, 256, 512), spatial_channels=(64, 64, 64, 128), out_indices=(0, 1, 2), + out_channels=256, backbone_cfg=dict( type='ResNet', in_channels=3, diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 9ed586e141..2776c6aaed 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -13,47 +13,48 @@ class SpatialPath(BaseModule): and encode affluent spatial information. Args: - spatial_channels (Tuple[int]): Size of channel numbers of - various layers in Spatial Path. + num_channels (Tuple[int]): The number of channels of + each layers in Spatial Path. Default: (64, 64, 64, 128). - in_channel(int): Channel of input image. Default: 3. + in_channels(int): The number of channels of input + image. Default: 3. Returns: x (torch.Tensor): Feature map for Feature Fusion Module. 
""" def __init__(self, - spatial_channels=(64, 64, 64, 128), - in_channel=3, + num_channels=(64, 64, 64, 128), + in_channels=3, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): super(SpatialPath, self).__init__(init_cfg=init_cfg) - self.layer_stages = [] - for i in range(len(spatial_channels)): + self.layers = [] + for i in range(len(num_channels)): layer_name = f'layer{i + 1}' - self.layer_stages.append(layer_name) + self.layers.append(layer_name) if i == 0: self.add_module( layer_name, ConvModule( - in_channels=in_channel, - out_channels=spatial_channels[i], + in_channels=in_channels, + out_channels=num_channels[i], kernel_size=7, stride=2, padding=3, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) - elif i != len(spatial_channels) - 1: + elif i == len(num_channels) - 1: self.add_module( layer_name, ConvModule( - in_channels=spatial_channels[i - 1], - out_channels=spatial_channels[i], - kernel_size=3, - stride=2, - padding=1, + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=1, + stride=1, + padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) @@ -61,17 +62,17 @@ def __init__(self, self.add_module( layer_name, ConvModule( - in_channels=spatial_channels[i - 1], - out_channels=spatial_channels[i], - kernel_size=1, - stride=1, - padding=0, + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=3, + stride=2, + padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) def forward(self, x): - for i, layer_name in enumerate(self.layer_stages): + for i, layer_name in enumerate(self.layers): layer_stage = getattr(self, layer_name) x = layer_stage(x) return x @@ -81,14 +82,14 @@ class AttentionRefinementModule(BaseModule): """Attention Refinement Module (ARM) to refine the features of each stage. Args: - in_channel (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. Returns: x_out (torch.Tensor): Feature map of Attention Refinement Module. """ def __init__(self, - in_channel, + in_channels, out_channel, conv_cfg=None, norm_cfg=dict(type='BN'), @@ -96,7 +97,7 @@ def __init__(self, init_cfg=None): super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg) self.conv_layer = ConvModule( - in_channels=in_channel, + in_channels=in_channels, out_channels=out_channel, kernel_size=3, stride=1, @@ -118,7 +119,7 @@ def __init__(self, def forward(self, x): x = self.conv_layer(x) x_atten = self.atten_conv_layer(x) - x_out = torch.mul(x, x_atten) + x_out = x * x_atten return x_out @@ -126,16 +127,18 @@ class ContextPath(BaseModule): """Context Path to provide sufficient receptive field. Args: - backbone_cfg:(dict | None): Config of backbone of + backbone_cfg:(dict): Config of backbone of Context Path. - context_channels (Tuple[int]): Size of channel numbers of - various modules in Context Path. + context_channels (Tuple[int]): The number of channel numbers + of various modules in Context Path. Default: (128, 256, 512). align_corners (bool, optional): The align_corners argument of resize operation. Default: False. Returns: - [x_16_up, x_32_up] (List[torch.Tensor]): List of two feature - maps for Feature Fusion Module and Auxiliary Head. + x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps + undergoing upsampling from 1/16 and 1/32 downsampling + feature maps. These two feature maps are used for Feature + Fusion Module and Auxiliary Head. 
""" def __init__(self, @@ -198,7 +201,7 @@ def forward(self, x): x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest') x_16_up = self.conv_head16(x_16_up) - return [x_16_up, x_32_up] + return x_16_up, x_32_up class FeatureFusionModule(BaseModule): @@ -206,8 +209,8 @@ class FeatureFusionModule(BaseModule): and high level output feature of Context Path. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. Returns: x_out (torch.Tensor): Feature map of Feature Fusion Module. """ @@ -230,8 +233,6 @@ def __init__(self, norm_cfg=norm_cfg, act_cfg=act_cfg) self.gap = nn.AdaptiveAvgPool2d((1, 1)) - # use conv-bn instead of 2 layer mlp, - # so that tensorrt 7.2.3.4 can work for fp16 self.conv_atten = nn.Sequential( ConvModule( in_channels=out_channels, @@ -250,7 +251,7 @@ def forward(self, x_sp, x_cp): x_atten = self.gap(x_fuse) # TODO: No BN and more 1x1 conv in paper. x_atten = self.conv_atten(x_atten) - x_atten = torch.mul(x_fuse, x_atten) + x_atten = x_fuse * x_atten x_out = x_atten + x_fuse return x_out @@ -264,9 +265,10 @@ class BiSeNetV1(BaseModule): Segmentation `_. Args: - in_channel(int): Channel of input image. Default: 3. - backbone_cfg:(dict | None): Config of backbone of + backbone_cfg:(dict): Config of backbone of Context Path. + in_channels(int): The number of channels of input + image. Default: 3. spatial_channels (Tuple[int]): Size of channel numbers of various layers in Spatial Path. Default: (64, 64, 64, 128). @@ -278,36 +280,47 @@ class BiSeNetV1(BaseModule): align_corners (bool, optional): The align_corners argument of resize operation in Bilateral Guided Aggregation Layer. Default: False. + out_channels(int): The number of channels of output. + It must be the same with `in_channels` of decode_head. + Default: 256. 
""" def __init__(self, - in_channel=3, - backbone_cfg=None, + backbone_cfg, + in_channels=3, spatial_channels=(64, 64, 64, 128), context_channels=(128, 256, 512), out_indices=(0, 1, 2), align_corners=False, + out_channels=256, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='ReLU'), init_cfg=None): super(BiSeNetV1, self).__init__(init_cfg=init_cfg) + if len(spatial_channels) != 4: + raise AssertionError('Length of input channels of Spatial \ + Path must be 4!') + if len(context_channels) != 3: + raise AssertionError('Length of input channels of Context \ + Path must be 3!') self.out_indices = out_indices self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, self.align_corners) - self.spatial_path = SpatialPath(spatial_channels, in_channel) - self.ffm = FeatureFusionModule(256, 256) + self.spatial_path = SpatialPath(spatial_channels, in_channels) + self.ffm = FeatureFusionModule(context_channels[1], out_channels) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg def forward(self, x): + # stole refactoring code from Coin Cheung, thanks x_context8, x_context16 = self.context_path(x) x_spatial = self.spatial_path(x) x_fuse = self.ffm(x_spatial, x_context8) - outs = [x_fuse] + [x_context8, x_context16] + outs = [x_fuse, x_context8, x_context16] outs = [outs[i] for i in self.out_indices] return tuple(outs) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index e6ff2f90c8..591388e71a 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import pytest import torch from mmseg.models.backbones import BiSeNetV1 @@ -17,7 +18,7 @@ def test_bisenetv1_backbone(): norm_eval=False, style='pytorch', contract_dilation=True) - model = BiSeNetV1(in_channel=3, backbone_cfg=backbone_cfg) + model = BiSeNetV1(in_channels=3, backbone_cfg=backbone_cfg) model.init_weights() model.train() batch_size = 2 @@ -37,3 +38,17 @@ def test_bisenetv1_backbone(): imgs = torch.randn(batch_size, 3, 952, 527) feat = model(imgs) assert len(feat) == 3 + + with pytest.raises(AssertionError): + # BiSeNetV1 spatial path channel constraints. + BiSeNetV1( + backbone_cfg=backbone_cfg, + in_channels=3, + spatial_channels=(64, 64, 64)) + + with pytest.raises(AssertionError): + # BiSeNetV1 context path constraints. + BiSeNetV1( + backbone_cfg=backbone_cfg, + in_channels=3, + context_channels=(128, 256, 512, 1024)) From 4c504f8edacb90d395ba8c68657250775fa5a4ae Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 17 Sep 2021 21:17:09 +0800 Subject: [PATCH 04/11] Fix assertion bug --- mmseg/models/backbones/bisenetv1.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 2776c6aaed..519eed46da 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -249,7 +249,7 @@ def forward(self, x_sp, x_cp): x_concat = torch.cat([x_sp, x_cp], dim=1) x_fuse = self.conv1(x_concat) x_atten = self.gap(x_fuse) - # TODO: No BN and more 1x1 conv in paper. + # Note: No BN and more 1x1 conv in paper. 
x_atten = self.conv_atten(x_atten) x_atten = x_fuse * x_atten x_out = x_atten + x_fuse @@ -299,12 +299,12 @@ def __init__(self, init_cfg=None): super(BiSeNetV1, self).__init__(init_cfg=init_cfg) - if len(spatial_channels) != 4: - raise AssertionError('Length of input channels of Spatial \ - Path must be 4!') - if len(context_channels) != 3: - raise AssertionError('Length of input channels of Context \ - Path must be 3!') + assert len(spatial_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' + + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + self.out_indices = out_indices self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, From 8e8bae9b5d19954328029d588f48e7b0cabc118c Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Sat, 18 Sep 2021 01:24:57 +0800 Subject: [PATCH 05/11] Adding Assert --- configs/bisenetv1/README.md | 29 +++++++++++++ configs/bisenetv1/bisenetv1.yml | 43 +++++++++++++++++++ ...1_r18-d32_4x8_1024x1024_160k_cityscapes.py | 11 +++++ model-index.yml | 1 + 4 files changed, 84 insertions(+) create mode 100644 configs/bisenetv1/README.md create mode 100644 configs/bisenetv1/bisenetv1.yml create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md new file mode 100644 index 0000000000..08b723d685 --- /dev/null +++ b/configs/bisenetv1/README.md @@ -0,0 +1,29 @@ +# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation + +## Introduction + + + +```latex +@inproceedings{yu2018bisenet, + title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, + author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={325--341}, + year={2018} +} +``` + +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| BiSeNetV1 (4x4) | R-18-D32 | 1024x1024 | 160000 | 3.3 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | +| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 3.3 | - | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | + +Note: + +- `4x4`: Using 4 GPUs with 4 samples per GPU in training. +- `4x8`: Using 4 GPUs with 8 samples per GPU in training. diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml new file mode 100644 index 0000000000..94f2e26bff --- /dev/null +++ b/configs/bisenetv1/bisenetv1.yml @@ -0,0 +1,43 @@ +Collections: +- Metadata: + Training Data: + - Cityscapes + Name: bisenetv1 +Models: +- Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 74.37 + mIoU(ms+flip): 76.91 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 75.16 + mIoU(ms+flip): 77.24 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..a495b7190f --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py @@ -0,0 +1,11 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=8, + workers_per_gpu=8, +) diff --git a/model-index.yml b/model-index.yml index d08ad33178..e6c0782275 100644 --- a/model-index.yml +++ b/model-index.yml @@ -1,6 +1,7 @@ Import: - configs/ann/ann.yml - configs/apcnet/apcnet.yml +- configs/bisenetv1/bisenetv1.yml - configs/ccnet/ccnet.yml - configs/cgnet/cgnet.yml - configs/danet/danet.yml From c331fa0b456cb72a16f0a618f32c73bd72f0aea2 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Wed, 22 Sep 2021 17:34:01 +0800 Subject: [PATCH 06/11] Adding Unittest --- mmseg/models/backbones/bisenetv1.py | 8 ++- .../test_backbones/test_bisenetv1.py | 55 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 519eed46da..3ac3fd8f4d 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -30,6 +30,9 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): super(SpatialPath, self).__init__(init_cfg=init_cfg) + assert len(num_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' 
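+        # the four channel entries map onto a 7x7/2 stem, two 3x3/2 convs
+        # and a final 1x1 projection, i.e. an overall output stride of 8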
+ self.layers = [] for i in range(len(num_channels)): layer_name = f'layer{i + 1}' @@ -150,6 +153,9 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): super(ContextPath, self).__init__(init_cfg=init_cfg) + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + self.backbone = build_backbone(backbone_cfg) self.align_corners = align_corners @@ -267,7 +273,7 @@ class BiSeNetV1(BaseModule): Args: backbone_cfg:(dict): Config of backbone of Context Path. - in_channels(int): The number of channels of input + in_channels (int): The number of channels of input image. Default: 3. spatial_channels (Tuple[int]): Size of channel numbers of various layers in Spatial Path. diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 591388e71a..3b5a71e56d 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -3,6 +3,9 @@ import torch from mmseg.models.backbones import BiSeNetV1 +from mmseg.models.backbones.bisenetv1 import (AttentionRefinementModule, + ContextPath, FeatureFusionModule, + SpatialPath) def test_bisenetv1_backbone(): @@ -52,3 +55,55 @@ def test_bisenetv1_backbone(): backbone_cfg=backbone_cfg, in_channels=3, context_channels=(128, 256, 512, 1024)) + + +def test_bisenetv1_spatial_path(): + with pytest.raises(AssertionError): + # BiSeNetV1 spatial path channel constraints. + SpatialPath(num_channels=(64, 64, 64), in_channels=3) + + +def test_bisenetv1_context_path(): + backbone_cfg = dict( + type='ResNet', + in_channels=3, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_eval=False, + style='pytorch', + contract_dilation=True) + + with pytest.raises(AssertionError): + # BiSeNetV1 context path constraints. + ContextPath( + backbone_cfg=backbone_cfg, context_channels=(128, 256, 512, 1024)) + + +def test_bisenetv1_attention_refinement_module(): + x_arm = AttentionRefinementModule(512, 128) + assert x_arm.conv_layer.in_channels == 512 + assert x_arm.conv_layer.out_channels == 128 + assert x_arm.conv_layer.kernel_size == (3, 3) + x = torch.randn(2, 512, 32, 64) + x_out = x_arm(x) + assert x_out.shape == torch.Size([2, 128, 32, 64]) + + +def test_bisenetv1_feature_fusion_module(): + ffm = FeatureFusionModule(256, 512) + assert ffm.conv1.in_channels == 256 + assert ffm.conv1.out_channels == 512 + assert ffm.conv1.kernel_size == (1, 1) + assert ffm.gap.output_size == (1, 1) + assert ffm.conv_atten[0].in_channels == 512 + assert ffm.conv_atten[0].out_channels == 512 + assert ffm.conv_atten[0].kernel_size == (1, 1) + + ffm = FeatureFusionModule(256, 256) + x1 = torch.randn(2, 128, 128, 256) + x2 = torch.randn(2, 128, 128, 256) + x_out = ffm(x1, x2) + assert x_out.shape == torch.Size([2, 256, 128, 256]) From ce3149367cfb7cea7f4c88d89a59e5fb3070dc28 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Wed, 22 Sep 2021 22:38:59 +0800 Subject: [PATCH 07/11] Fixing typo --- mmseg/models/backbones/bisenetv1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 3ac3fd8f4d..4beb7b394d 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -13,18 +13,18 @@ class SpatialPath(BaseModule): and encode affluent spatial information. Args: + in_channels(int): The number of channels of input + image. Default: 3. 
num_channels (Tuple[int]): The number of channels of each layers in Spatial Path. Default: (64, 64, 64, 128). - in_channels(int): The number of channels of input - image. Default: 3. Returns: x (torch.Tensor): Feature map for Feature Fusion Module. """ def __init__(self, - num_channels=(64, 64, 64, 128), in_channels=3, + num_channels=(64, 64, 64, 128), conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), @@ -315,7 +315,7 @@ def __init__(self, self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, self.align_corners) - self.spatial_path = SpatialPath(spatial_channels, in_channels) + self.spatial_path = SpatialPath(in_channels, spatial_channels) self.ffm = FeatureFusionModule(context_channels[1], out_channels) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg From 013c57daa6c9c1acc9a616e257130591e917759a Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Mon, 27 Sep 2021 21:15:46 +0800 Subject: [PATCH 08/11] Uploading models & logs --- configs/bisenetv1/README.md | 9 +- configs/bisenetv1/bisenetv1.yml | 85 +++++++++++++++++-- ...in1k-pre_4x4_1024x1024_160k_cityscapes.py} | 9 +- ..._in1k-pre_4x8_1024x1024_160k_cityscapes.py | 5 ++ ...1_r50-d32_4x4_1024x1024_160k_cityscapes.py | 48 +++++++++++ ..._in1k-pre_4x4_1024x1024_160k_cityscapes.py | 7 ++ 6 files changed, 152 insertions(+), 11 deletions(-) rename configs/bisenetv1/{bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py => bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py} (56%) create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py create mode 100644 configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py create mode 100644 configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md index 08b723d685..051842a69b 100644 --- a/configs/bisenetv1/README.md +++ b/configs/bisenetv1/README.md @@ -20,10 +20,13 @@ | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| BiSeNetV1 (4x4) | R-18-D32 | 1024x1024 | 160000 | 3.3 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | -| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 3.3 | - | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | +| BiSeNetV1 (ResNet18, train from scratch) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) | +| BiSeNetV1 (ResNet18) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) | +| BiSeNetV1 (ResNet18, 4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) | +| BiSeNetV1 (ResNet50, train from scratch) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) | +| BiSeNetV1 (ResNet50) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) | Note: -- `4x4`: Using 4 GPUs with 4 samples per GPU in training. - `4x8`: Using 4 GPUs with 8 samples per GPU in training. 
+- Default setting is 4 GPUs with 4 samples per GPU in training. diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml index 94f2e26bff..6ce193a9a9 100644 --- a/configs/bisenetv1/bisenetv1.yml +++ b/configs/bisenetv1/bisenetv1.yml @@ -17,27 +17,100 @@ Models: resolution: (1024,1024) value: 31.48 lr schd: 160000 - memory (GB): 3.3 + memory (GB): 5.69 Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 74.44 + mIoU(ms+flip): 77.05 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 + lr schd: 160000 + memory (GB): 5.69 + Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes Results: Dataset: Cityscapes Metrics: mIoU: 74.37 mIoU(ms+flip): 76.91 Task: Semantic Segmentation - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth -- Config: configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py In Collection: bisenetv1 Metadata: backbone: R-18-D32 crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 lr schd: 160000 - memory (GB): 3.3 - Name: bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes + memory (GB): 11.17 + Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes Results: Dataset: Cityscapes Metrics: mIoU: 75.16 mIoU(ms+flip): 77.24 Task: Semantic Segmentation - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth +- Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 129.7 + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 76.92 + mIoU(ms+flip): 78.87 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth +- Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: 
(1024,1024)
+    inference time (ms/im):
+    - backend: PyTorch
+      batch size: 1
+      hardware: V100
+      mode: FP32
+      resolution: (1024,1024)
+      value: 129.7
+    lr schd: 160000
+    memory (GB): 15.39
+  Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+  Results:
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.68
+      mIoU(ms+flip): 79.57
+    Task: Semantic Segmentation
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
similarity index 56%
rename from configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py
rename to configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
index a495b7190f..ef061a16bd 100644
--- a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -3,9 +3,14 @@
     '../_base_/datasets/cityscapes_1024x1024.py',
     '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
 ]
+model = dict(
+    backbone=dict(
+        backbone_cfg=dict(
+            init_cfg=dict(
+                type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))
 lr_config = dict(warmup='linear', warmup_iters=1000)
 optimizer = dict(lr=0.025)
 data = dict(
-    samples_per_gpu=8,
-    workers_per_gpu=8,
+    samples_per_gpu=4,
+    workers_per_gpu=4,
 )
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..ea27ef0a11
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
@@ -0,0 +1,5 @@
+_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..72e3024aef
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../_base_/models/bisenetv1_r18-d32.py',
+    '../_base_/datasets/cityscapes_1024x1024.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='BiSeNetV1',
+        context_channels=(512, 1024, 2048),
+        spatial_channels=(256, 256, 256, 512),
+        out_channels=1024,
+        backbone_cfg=dict(
+            type='ResNet',
+            depth=50)),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=0,
+        channels=1024),
+    auxiliary_head=[
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=1,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=2,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+    ])
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.05)
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..5625a76c08 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,7 @@ +_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py' +model = dict( + type='EncoderDecoder', + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) From ac1b2f73f469ac990e3b547a96a36d03af4eb2ae Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Tue, 28 Sep 2021 13:57:52 +0800 Subject: [PATCH 09/11] Fixing unittest error --- .../test_backbones/test_bisenetv1.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 3b5a71e56d..8e1571d6fb 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -25,20 +25,20 @@ def test_bisenetv1_backbone(): model.init_weights() model.train() batch_size = 2 - imgs = torch.randn(batch_size, 3, 512, 1024) + imgs = torch.randn(batch_size, 3, 256, 512) feat = model(imgs) assert len(feat) == 3 # output for segment Head - assert feat[0].shape == torch.Size([batch_size, 256, 64, 128]) + assert feat[0].shape == torch.Size([batch_size, 256, 32, 64]) # for auxiliary head 1 - assert feat[1].shape == torch.Size([batch_size, 128, 64, 128]) + assert feat[1].shape == torch.Size([batch_size, 128, 32, 64]) # for auxiliary head 2 - assert feat[2].shape == torch.Size([batch_size, 128, 32, 64]) + assert feat[2].shape == torch.Size([batch_size, 128, 16, 32]) # Test input with rare shape batch_size = 2 - imgs = torch.randn(batch_size, 3, 952, 527) + imgs = torch.randn(batch_size, 3, 527, 279) feat = model(imgs) assert len(feat) == 3 @@ -83,27 +83,27 @@ def test_bisenetv1_context_path(): def test_bisenetv1_attention_refinement_module(): - x_arm = AttentionRefinementModule(512, 128) - assert x_arm.conv_layer.in_channels == 512 - assert x_arm.conv_layer.out_channels == 128 + x_arm = AttentionRefinementModule(256, 64) + assert x_arm.conv_layer.in_channels == 256 + assert x_arm.conv_layer.out_channels == 64 assert x_arm.conv_layer.kernel_size == (3, 3) - x = torch.randn(2, 512, 32, 64) + x = torch.randn(2, 256, 32, 64) x_out = x_arm(x) - assert x_out.shape == torch.Size([2, 128, 32, 64]) + assert x_out.shape == torch.Size([2, 64, 32, 64]) def test_bisenetv1_feature_fusion_module(): - ffm = FeatureFusionModule(256, 512) - assert ffm.conv1.in_channels == 256 - assert ffm.conv1.out_channels == 512 + ffm = FeatureFusionModule(128, 256) + assert ffm.conv1.in_channels == 128 + assert ffm.conv1.out_channels == 256 assert ffm.conv1.kernel_size == (1, 1) assert ffm.gap.output_size == (1, 1) - assert ffm.conv_atten[0].in_channels == 512 - assert ffm.conv_atten[0].out_channels == 512 + assert ffm.conv_atten[0].in_channels == 256 + assert ffm.conv_atten[0].out_channels == 256 assert ffm.conv_atten[0].kernel_size == (1, 1) - ffm = FeatureFusionModule(256, 256) - x1 = torch.randn(2, 128, 128, 256) - x2 = torch.randn(2, 128, 128, 256) + ffm = FeatureFusionModule(128, 128) + x1 = torch.randn(2, 64, 64, 128) + x2 = torch.randn(2, 64, 64, 128) x_out = ffm(x1, x2) - assert x_out.shape == torch.Size([2, 256, 128, 256]) + assert x_out.shape == torch.Size([2, 128, 64, 128]) From f28204d3582e082a1fcbc91522b24d60a9ea4682 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Tue, 28 Sep 2021 
20:19:40 +0800
Subject: [PATCH 10/11] changing README.md

---
 configs/bisenetv1/README.md     | 20 ++++--
 configs/bisenetv1/bisenetv1.yml | 83 ++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
index 051842a69b..cd0ebd526b 100644
--- a/configs/bisenetv1/README.md
+++ b/configs/bisenetv1/README.md
@@ -4,6 +4,13 @@

+<a href="https://github.com/ycszen/TorchSeg/tree/master/model/bisenet">Official Repo</a>
+
+<a href="https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266">Code Snippet</a>
+
+<details>
+<summary align="right"><a href="https://arxiv.org/abs/1808.00897">BiSeNetV1 (ECCV'2018)</a></summary>
+
 ```latex
 @inproceedings{yu2018bisenet,
   title={Bisenet: Bilateral segmentation network for real-time semantic segmentation},
   author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong},
   booktitle={Proceedings of the European conference on computer vision (ECCV)},
   pages={325--341},
   year={2018}
 }
 ```
+
+</details>
+
 ## Results and models

 ### Cityscapes

 | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
 | ------ | -------- | --------- | ------: | -------- | -------------- | ---: | ------------- | ------ | -------- |
-| BiSeNetV1 (ResNet18, train from scratch) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
-| BiSeNetV1 (ResNet18) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
-| BiSeNetV1 (ResNet18, 4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
-| BiSeNetV1 (ResNet50, train from scratch) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
-| BiSeNetV1 (ResNet50) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
+| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
+| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
+| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
+| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
+| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |

 Note:

 - `4x8`: Using 4 GPUs with 8 samples per GPU in training.
 - Default setting is 4 GPUs with 4 samples per GPU in training.
+- `No Pretrain` means the model is trained from scratch.
diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml
index 6ce193a9a9..6de872b863 100644
--- a/configs/bisenetv1/bisenetv1.yml
+++ b/configs/bisenetv1/bisenetv1.yml
@@ -1,116 +1,125 @@
 Collections:
-- Metadata:
+- Name: bisenetv1
+  Metadata:
     Training Data:
     - Cityscapes
-  Name: bisenetv1
+  Paper:
+    URL: https://arxiv.org/abs/1808.00897
+    Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation'
+  README: configs/bisenetv1/README.md
+  Code:
+    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266
+    Version: v0.18.0
+  Converted From:
+    Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet
 Models:
-- Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 5.69
-  Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 74.44
      mIoU(ms+flip): 77.05
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth
-- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 5.69
-  Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 74.37
       mIoU(ms+flip): 76.91
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth
-- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 11.17
-  Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 75.16
       mIoU(ms+flip): 77.24
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth
-- Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-50-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 129.7
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 129.7
-    lr schd: 160000
     memory (GB): 3.3
-  Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 76.92
       mIoU(ms+flip): 78.87
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth
-- Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-50-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 129.7
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 129.7
-    lr schd: 160000
     memory (GB): 15.39
-  Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 77.68
       mIoU(ms+flip): 79.57
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth

From 79b00c08622a4a0b4a48b9e3a299eabe6424f98a Mon Sep 17 00:00:00 2001
From: MengzhangLI
Date: Tue, 28 Sep 2021 20:21:25 +0800
Subject: [PATCH 11/11] changing README.md

---
 configs/bisenetv1/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
index cd0ebd526b..344781068a 100644
--- a/configs/bisenetv1/README.md
+++ b/configs/bisenetv1/README.md
@@ -39,4 +39,4 @@ Note:

 - `4x8`: Using 4 GPUs with 8 samples per GPU in training.
 - Default setting is 4 GPUs with 4 samples per GPU in training.
-- `No Pretrain` means the model is trained from scratch.
+- `No Pretrain` means the model is trained from scratch.
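The `4x8` rows in the table above differ from the default `4x4` schedule only in the data loader settings. A minimal sketch of such a config is shown below, following the `_base_` inheritance pattern this PR uses in `bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py`. The shipped `bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py` is not reproduced in this patch set, so the exact override (in particular `workers_per_gpu=8`) is an assumption here, not the verified file contents.

```python
# Sketch only: a 4 GPU x 8 samples-per-GPU variant expressed as an
# override of the 4x4 ImageNet-pretrained config. Values other than
# samples_per_gpu are assumptions; the shipped 4x8 config may differ.
_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'

data = dict(
    samples_per_gpu=8,  # `4x8`: 8 samples on each of the 4 GPUs
    workers_per_gpu=8,  # assumption: workers scaled with batch size
)
```

With 4 GPUs this doubles the effective batch size from 16 to 32, which is consistent with the larger memory footprint reported for the `4x8` row (11.17 GB vs. 5.69 GB).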