From a44c172e21a105761f4af02eab0ac55b678c3839 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 15:34:58 +0800 Subject: [PATCH 01/11] First Commit --- .../_base_/datasets/cityscapes_1024x1024.py | 35 ++ configs/_base_/models/bisenetv1_r18-d32.py | 67 ++++ ...1_r18-d32_4x4_1024x1024_160k_cityscapes.py | 11 + mmseg/models/backbones/__init__.py | 3 +- mmseg/models/backbones/bisenetv1.py | 302 ++++++++++++++++++ .../test_backbones/test_bisenetv1.py | 39 +++ 6 files changed, 456 insertions(+), 1 deletion(-) create mode 100644 configs/_base_/datasets/cityscapes_1024x1024.py create mode 100644 configs/_base_/models/bisenetv1_r18-d32.py create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py create mode 100644 mmseg/models/backbones/bisenetv1.py create mode 100644 tests/test_models/test_backbones/test_bisenetv1.py diff --git a/configs/_base_/datasets/cityscapes_1024x1024.py b/configs/_base_/datasets/cityscapes_1024x1024.py new file mode 100644 index 0000000000..f98d929723 --- /dev/null +++ b/configs/_base_/datasets/cityscapes_1024x1024.py @@ -0,0 +1,35 @@ +_base_ = './cityscapes.py' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +crop_size = (1024, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_semantic_seg']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(2048, 1024), + # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py new file mode 100644 index 0000000000..e9a62def60 --- /dev/null +++ b/configs/_base_/models/bisenetv1_r18-d32.py @@ -0,0 +1,67 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + type='BiSeNetV1', + in_channel=3, + context_channels=(128, 256, 512), + spatial_channels=(64, 64, 64, 128), + out_indices=(0, 1, 2), + backbone_cfg=dict( + type='ResNet', + in_channels=3, + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + norm_cfg=norm_cfg, + align_corners=False, + init_cfg=None), + decode_head=dict( + type='FCNHead', + in_channels=256, + in_index=0, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + 
align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=19, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ], + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..f4019e930e --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,11 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, +) diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index 75ef2c3a86..5e4373291d 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .bisenetv1 import BiSeNetV1 from .cgnet import CGNet from .fast_scnn import FastSCNN from .hrnet import HRNet @@ -15,5 +16,5 @@ __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', - 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer' + 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV1' ] diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py new file mode 100644 index 0000000000..20077d387b --- /dev/null +++ b/mmseg/models/backbones/bisenetv1.py @@ -0,0 +1,302 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule + +from mmseg.ops import resize +from ..builder import BACKBONES, build_backbone + + +class SpatialPath(BaseModule): + """Spatial Path to preserve the spatial size of the original input image + and encode affluent spatial information. + + Args: + spatial_channels (Tuple[int]): Size of channel numbers of + various layers in Spatial Path. + Default: (64, 64, 64, 128). + in_channel(int): Channel of input image. Default: 3. + Returns: + x (torch.Tensor): Feature map for Feature Fusion Module. 
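+
+    Example:
+        >>> # usage sketch: three stride-2 convs give an overall output
+        >>> # stride of 8; the final 1x1 conv only sets the channel width
+        >>> import torch
+        >>> sp = SpatialPath((64, 64, 64, 128), in_channel=3)
+        >>> sp(torch.randn(2, 3, 64, 64)).shape
+        torch.Size([2, 128, 8, 8])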
+ """ + + def __init__(self, + spatial_channels=(64, 64, 64, 128), + in_channel=3, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(SpatialPath, self).__init__(init_cfg=init_cfg) + self.layer_stages = [] + for i in range(len(spatial_channels)): + layer_name = f'layer{i + 1}' + self.layer_stages.append(layer_name) + if i == 0: + self.add_module( + layer_name, + ConvModule( + in_channels=in_channel, + out_channels=spatial_channels[i], + kernel_size=7, + stride=2, + padding=3, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + elif i != len(spatial_channels) - 1: + self.add_module( + layer_name, + ConvModule( + in_channels=spatial_channels[i - 1], + out_channels=spatial_channels[i], + kernel_size=3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + self.add_module( + layer_name, + ConvModule( + in_channels=spatial_channels[i - 1], + out_channels=spatial_channels[i], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + for i, layer_name in enumerate(self.layer_stages): + layer_stage = getattr(self, layer_name) + x = layer_stage(x) + return x + + +class AttentionRefinementModule(BaseModule): + """Attention Refinement Module (ARM) to refine the features of each stage. + + Args: + in_channel (int): Number of input channels. + out_channels (int): Number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Attention Refinement Module. + """ + + def __init__(self, + in_channel, + out_channel, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg) + self.conv_layer = ConvModule( + in_channels=in_channel, + out_channels=out_channel, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.atten_conv_layer = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=out_channel, + out_channels=out_channel, + kernel_size=1, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None), nn.Sigmoid()) + + def forward(self, x): + x = self.conv_layer(x) + x_atten = self.atten_conv_layer(x) + x_out = torch.mul(x, x_atten) + return x_out + + +class ContextPath(BaseModule): + """Context Path to provide sufficient receptive field. + + Args: + backbone_cfg:(dict | None): Config of backbone of + Context Path. + context_channels (Tuple[int]): Size of channel numbers of + various modules in Context Path. + Default: (128, 256, 512). + align_corners (bool, optional): The align_corners argument of + resize operation. Default: False. + Returns: + [x_16_up, x_32_up] (List[torch.Tensor]): List of two feature + maps for Feature Fusion Module and Auxiliary Head. 
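+
+    Example:
+        >>> # usage sketch; assumes mmseg's default ResNet-18 backbone,
+        >>> # whose stages emit 64/128/256/512 channels at strides 4/8/16/32
+        >>> import torch
+        >>> cp = ContextPath(dict(type='ResNet', depth=18),
+        ...                  context_channels=(128, 256, 512))
+        >>> x_16_up, x_32_up = cp(torch.randn(2, 3, 64, 64))
+        >>> x_16_up.shape, x_32_up.shape
+        (torch.Size([2, 128, 8, 8]), torch.Size([2, 128, 4, 4]))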
+ """ + + def __init__(self, + backbone_cfg, + context_channels=(128, 256, 512), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(ContextPath, self).__init__(init_cfg=init_cfg) + self.backbone = build_backbone(backbone_cfg) + + self.align_corners = align_corners + self.arm16 = AttentionRefinementModule(context_channels[1], + context_channels[0]) + self.arm32 = AttentionRefinementModule(context_channels[2], + context_channels[0]) + self.conv_head32 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv_head16 = ConvModule( + in_channels=context_channels[0], + out_channels=context_channels[0], + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap_conv = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels=context_channels[2], + out_channels=context_channels[0], + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + def forward(self, x): + x_4, x_8, x_16, x_32 = self.backbone(x) + x_gap = self.gap_conv(x_32) + + x_32_arm = self.arm32(x_32) + x_32_sum = x_32_arm + x_gap + x_32_up = resize(input=x_32_sum, size=x_16.shape[2:], mode='nearest') + x_32_up = self.conv_head32(x_32_up) + + x_16_arm = self.arm16(x_16) + x_16_sum = x_16_arm + x_32_up + x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest') + x_16_up = self.conv_head16(x_16_up) + + return [x_16_up, x_32_up] + + +class FeatureFusionModule(BaseModule): + """Feature Fusion Module to fuse low level output feature of Spatial Path + and high level output feature of Context Path. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + Returns: + x_out (torch.Tensor): Feature map of Feature Fusion Module. + """ + + def __init__(self, + in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super(FeatureFusionModule, self).__init__(init_cfg=init_cfg) + self.conv1 = ConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.gap = nn.AdaptiveAvgPool2d((1, 1)) + # use conv-bn instead of 2 layer mlp, + # so that tensorrt 7.2.3.4 can work for fp16 + self.conv_atten = nn.Sequential( + ConvModule( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), nn.Sigmoid()) + + def forward(self, x_sp, x_cp): + x_concat = torch.cat([x_sp, x_cp], dim=1) + x_fuse = self.conv1(x_concat) + x_atten = self.gap(x_fuse) + # TODO: No BN and more 1x1 conv in paper. + x_atten = self.conv_atten(x_atten) + x_atten = torch.mul(x_fuse, x_atten) + x_out = x_atten + x_fuse + return x_out + + +@BACKBONES.register_module() +class BiSeNetV1(BaseModule): + """BiSeNetV1 backbone. + + This backbone is the implementation of `BiSeNet: Bilateral + Segmentation Network for Real-time Semantic + Segmentation `_. + + Args: + align_corners (bool, optional): The align_corners argument of + resize operation in Bilateral Guided Aggregation Layer. + Default: False. 
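+
+    Example:
+        >>> # usage sketch mirroring the unit test below: the first output
+        >>> # feeds the decode head, the other two the auxiliary heads
+        >>> import torch
+        >>> model = BiSeNetV1(in_channel=3,
+        ...                   backbone_cfg=dict(type='ResNet', depth=18))
+        >>> [tuple(o.shape) for o in model(torch.randn(2, 3, 64, 128))]
+        [(2, 256, 8, 16), (2, 128, 8, 16), (2, 128, 4, 8)]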
+ """ + + def __init__(self, + in_channel=3, + backbone_cfg=None, + spatial_channels=(64, 64, 64, 128), + context_channels=(128, 256, 512), + out_indices=(0, 1, 2), + align_corners=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='ReLU'), + init_cfg=None): + + super(BiSeNetV1, self).__init__(init_cfg=init_cfg) + self.out_indices = out_indices + self.align_corners = align_corners + self.context_path = ContextPath(backbone_cfg, context_channels, + self.align_corners) + self.spatial_path = SpatialPath(spatial_channels, in_channel) + self.ffm = FeatureFusionModule(256, 256) + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + + def forward(self, x): + x_context8, x_context16 = self.context_path(x) + x_spatial = self.spatial_path(x) + x_fuse = self.ffm(x_spatial, x_context8) + + outs = [x_fuse] + [x_context8, x_context16] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py new file mode 100644 index 0000000000..9bcf9e6d73 --- /dev/null +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -0,0 +1,39 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models.backbones import BiSeNetV1 + + +def test_bisenetv2_backbone(): + # Test BiSeNetV1 Standard Forward + backbone_cfg = dict( + type='ResNet', + in_channels=3, + depth=18, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_eval=False, + style='pytorch', + contract_dilation=True) + model = BiSeNetV1(in_channel=3, backbone_cfg=backbone_cfg) + model.init_weights() + model.train() + batch_size = 2 + imgs = torch.randn(batch_size, 3, 512, 1024) + feat = model(imgs) + + assert len(feat) == 3 + # output for segment Head + assert feat[0].shape == torch.Size([batch_size, 256, 64, 128]) + # for auxiliary head 1 + assert feat[1].shape == torch.Size([batch_size, 128, 64, 128]) + # for auxiliary head 2 + assert feat[2].shape == torch.Size([batch_size, 128, 32, 64]) + + # Test input with rare shape + batch_size = 2 + imgs = torch.randn(batch_size, 3, 952, 527) + feat = model(imgs) + assert len(feat) == 3 From dfdb981124ac8de8eb75e097c774bb039e62b6f2 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 17:53:22 +0800 Subject: [PATCH 02/11] fix typos --- mmseg/models/backbones/bisenetv1.py | 11 +++++++++++ tests/test_models/test_backbones/test_bisenetv1.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 20077d387b..9ed586e141 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -264,6 +264,17 @@ class BiSeNetV1(BaseModule): Segmentation `_. Args: + in_channel(int): Channel of input image. Default: 3. + backbone_cfg:(dict | None): Config of backbone of + Context Path. + spatial_channels (Tuple[int]): Size of channel numbers of + various layers in Spatial Path. + Default: (64, 64, 64, 128). + context_channels (Tuple[int]): Size of channel numbers of + various modules in Context Path. + Default: (128, 256, 512). + out_indices (Tuple[int] | int, optional): Output from which stages. + Default: (0, 1, 2). align_corners (bool, optional): The align_corners argument of resize operation in Bilateral Guided Aggregation Layer. Default: False. 
diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 9bcf9e6d73..e6ff2f90c8 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -4,7 +4,7 @@ from mmseg.models.backbones import BiSeNetV1 -def test_bisenetv2_backbone(): +def test_bisenetv1_backbone(): # Test BiSeNetV1 Standard Forward backbone_cfg = dict( type='ResNet', From 3d241e0135d88ec47631e6ac55d6c9472414f57d Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 3 Sep 2021 20:01:15 +0800 Subject: [PATCH 03/11] fix typos --- configs/_base_/models/bisenetv1_r18-d32.py | 3 +- mmseg/models/backbones/bisenetv1.py | 103 ++++++++++-------- .../test_backbones/test_bisenetv1.py | 17 ++- 3 files changed, 76 insertions(+), 47 deletions(-) diff --git a/configs/_base_/models/bisenetv1_r18-d32.py b/configs/_base_/models/bisenetv1_r18-d32.py index e9a62def60..40698644ba 100644 --- a/configs/_base_/models/bisenetv1_r18-d32.py +++ b/configs/_base_/models/bisenetv1_r18-d32.py @@ -4,10 +4,11 @@ type='EncoderDecoder', backbone=dict( type='BiSeNetV1', - in_channel=3, + in_channels=3, context_channels=(128, 256, 512), spatial_channels=(64, 64, 64, 128), out_indices=(0, 1, 2), + out_channels=256, backbone_cfg=dict( type='ResNet', in_channels=3, diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 9ed586e141..2776c6aaed 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -13,47 +13,48 @@ class SpatialPath(BaseModule): and encode affluent spatial information. Args: - spatial_channels (Tuple[int]): Size of channel numbers of - various layers in Spatial Path. + num_channels (Tuple[int]): The number of channels of + each layers in Spatial Path. Default: (64, 64, 64, 128). - in_channel(int): Channel of input image. Default: 3. + in_channels(int): The number of channels of input + image. Default: 3. Returns: x (torch.Tensor): Feature map for Feature Fusion Module. 
""" def __init__(self, - spatial_channels=(64, 64, 64, 128), - in_channel=3, + num_channels=(64, 64, 64, 128), + in_channels=3, conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): super(SpatialPath, self).__init__(init_cfg=init_cfg) - self.layer_stages = [] - for i in range(len(spatial_channels)): + self.layers = [] + for i in range(len(num_channels)): layer_name = f'layer{i + 1}' - self.layer_stages.append(layer_name) + self.layers.append(layer_name) if i == 0: self.add_module( layer_name, ConvModule( - in_channels=in_channel, - out_channels=spatial_channels[i], + in_channels=in_channels, + out_channels=num_channels[i], kernel_size=7, stride=2, padding=3, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) - elif i != len(spatial_channels) - 1: + elif i == len(num_channels) - 1: self.add_module( layer_name, ConvModule( - in_channels=spatial_channels[i - 1], - out_channels=spatial_channels[i], - kernel_size=3, - stride=2, - padding=1, + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=1, + stride=1, + padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) @@ -61,17 +62,17 @@ def __init__(self, self.add_module( layer_name, ConvModule( - in_channels=spatial_channels[i - 1], - out_channels=spatial_channels[i], - kernel_size=1, - stride=1, - padding=0, + in_channels=num_channels[i - 1], + out_channels=num_channels[i], + kernel_size=3, + stride=2, + padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)) def forward(self, x): - for i, layer_name in enumerate(self.layer_stages): + for i, layer_name in enumerate(self.layers): layer_stage = getattr(self, layer_name) x = layer_stage(x) return x @@ -81,14 +82,14 @@ class AttentionRefinementModule(BaseModule): """Attention Refinement Module (ARM) to refine the features of each stage. Args: - in_channel (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. Returns: x_out (torch.Tensor): Feature map of Attention Refinement Module. """ def __init__(self, - in_channel, + in_channels, out_channel, conv_cfg=None, norm_cfg=dict(type='BN'), @@ -96,7 +97,7 @@ def __init__(self, init_cfg=None): super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg) self.conv_layer = ConvModule( - in_channels=in_channel, + in_channels=in_channels, out_channels=out_channel, kernel_size=3, stride=1, @@ -118,7 +119,7 @@ def __init__(self, def forward(self, x): x = self.conv_layer(x) x_atten = self.atten_conv_layer(x) - x_out = torch.mul(x, x_atten) + x_out = x * x_atten return x_out @@ -126,16 +127,18 @@ class ContextPath(BaseModule): """Context Path to provide sufficient receptive field. Args: - backbone_cfg:(dict | None): Config of backbone of + backbone_cfg:(dict): Config of backbone of Context Path. - context_channels (Tuple[int]): Size of channel numbers of - various modules in Context Path. + context_channels (Tuple[int]): The number of channel numbers + of various modules in Context Path. Default: (128, 256, 512). align_corners (bool, optional): The align_corners argument of resize operation. Default: False. Returns: - [x_16_up, x_32_up] (List[torch.Tensor]): List of two feature - maps for Feature Fusion Module and Auxiliary Head. + x_16_up, x_32_up (torch.Tensor, torch.Tensor): Two feature maps + undergoing upsampling from 1/16 and 1/32 downsampling + feature maps. These two feature maps are used for Feature + Fusion Module and Auxiliary Head. 
""" def __init__(self, @@ -198,7 +201,7 @@ def forward(self, x): x_16_up = resize(input=x_16_sum, size=x_8.shape[2:], mode='nearest') x_16_up = self.conv_head16(x_16_up) - return [x_16_up, x_32_up] + return x_16_up, x_32_up class FeatureFusionModule(BaseModule): @@ -206,8 +209,8 @@ class FeatureFusionModule(BaseModule): and high level output feature of Context Path. Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. Returns: x_out (torch.Tensor): Feature map of Feature Fusion Module. """ @@ -230,8 +233,6 @@ def __init__(self, norm_cfg=norm_cfg, act_cfg=act_cfg) self.gap = nn.AdaptiveAvgPool2d((1, 1)) - # use conv-bn instead of 2 layer mlp, - # so that tensorrt 7.2.3.4 can work for fp16 self.conv_atten = nn.Sequential( ConvModule( in_channels=out_channels, @@ -250,7 +251,7 @@ def forward(self, x_sp, x_cp): x_atten = self.gap(x_fuse) # TODO: No BN and more 1x1 conv in paper. x_atten = self.conv_atten(x_atten) - x_atten = torch.mul(x_fuse, x_atten) + x_atten = x_fuse * x_atten x_out = x_atten + x_fuse return x_out @@ -264,9 +265,10 @@ class BiSeNetV1(BaseModule): Segmentation `_. Args: - in_channel(int): Channel of input image. Default: 3. - backbone_cfg:(dict | None): Config of backbone of + backbone_cfg:(dict): Config of backbone of Context Path. + in_channels(int): The number of channels of input + image. Default: 3. spatial_channels (Tuple[int]): Size of channel numbers of various layers in Spatial Path. Default: (64, 64, 64, 128). @@ -278,36 +280,47 @@ class BiSeNetV1(BaseModule): align_corners (bool, optional): The align_corners argument of resize operation in Bilateral Guided Aggregation Layer. Default: False. + out_channels(int): The number of channels of output. + It must be the same with `in_channels` of decode_head. + Default: 256. 
""" def __init__(self, - in_channel=3, - backbone_cfg=None, + backbone_cfg, + in_channels=3, spatial_channels=(64, 64, 64, 128), context_channels=(128, 256, 512), out_indices=(0, 1, 2), align_corners=False, + out_channels=256, conv_cfg=None, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='ReLU'), init_cfg=None): super(BiSeNetV1, self).__init__(init_cfg=init_cfg) + if len(spatial_channels) != 4: + raise AssertionError('Length of input channels of Spatial \ + Path must be 4!') + if len(context_channels) != 3: + raise AssertionError('Length of input channels of Context \ + Path must be 3!') self.out_indices = out_indices self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, self.align_corners) - self.spatial_path = SpatialPath(spatial_channels, in_channel) - self.ffm = FeatureFusionModule(256, 256) + self.spatial_path = SpatialPath(spatial_channels, in_channels) + self.ffm = FeatureFusionModule(context_channels[1], out_channels) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg def forward(self, x): + # stole refactoring code from Coin Cheung, thanks x_context8, x_context16 = self.context_path(x) x_spatial = self.spatial_path(x) x_fuse = self.ffm(x_spatial, x_context8) - outs = [x_fuse] + [x_context8, x_context16] + outs = [x_fuse, x_context8, x_context16] outs = [outs[i] for i in self.out_indices] return tuple(outs) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index e6ff2f90c8..591388e71a 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. +import pytest import torch from mmseg.models.backbones import BiSeNetV1 @@ -17,7 +18,7 @@ def test_bisenetv1_backbone(): norm_eval=False, style='pytorch', contract_dilation=True) - model = BiSeNetV1(in_channel=3, backbone_cfg=backbone_cfg) + model = BiSeNetV1(in_channels=3, backbone_cfg=backbone_cfg) model.init_weights() model.train() batch_size = 2 @@ -37,3 +38,17 @@ def test_bisenetv1_backbone(): imgs = torch.randn(batch_size, 3, 952, 527) feat = model(imgs) assert len(feat) == 3 + + with pytest.raises(AssertionError): + # BiSeNetV1 spatial path channel constraints. + BiSeNetV1( + backbone_cfg=backbone_cfg, + in_channels=3, + spatial_channels=(64, 64, 64)) + + with pytest.raises(AssertionError): + # BiSeNetV1 context path constraints. + BiSeNetV1( + backbone_cfg=backbone_cfg, + in_channels=3, + context_channels=(128, 256, 512, 1024)) From 4c504f8edacb90d395ba8c68657250775fa5a4ae Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Fri, 17 Sep 2021 21:17:09 +0800 Subject: [PATCH 04/11] Fix assertion bug --- mmseg/models/backbones/bisenetv1.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 2776c6aaed..519eed46da 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -249,7 +249,7 @@ def forward(self, x_sp, x_cp): x_concat = torch.cat([x_sp, x_cp], dim=1) x_fuse = self.conv1(x_concat) x_atten = self.gap(x_fuse) - # TODO: No BN and more 1x1 conv in paper. + # Note: No BN and more 1x1 conv in paper. 
x_atten = self.conv_atten(x_atten) x_atten = x_fuse * x_atten x_out = x_atten + x_fuse @@ -299,12 +299,12 @@ def __init__(self, init_cfg=None): super(BiSeNetV1, self).__init__(init_cfg=init_cfg) - if len(spatial_channels) != 4: - raise AssertionError('Length of input channels of Spatial \ - Path must be 4!') - if len(context_channels) != 3: - raise AssertionError('Length of input channels of Context \ - Path must be 3!') + assert len(spatial_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' + + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + self.out_indices = out_indices self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, From 8e8bae9b5d19954328029d588f48e7b0cabc118c Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Sat, 18 Sep 2021 01:24:57 +0800 Subject: [PATCH 05/11] Adding Assert --- configs/bisenetv1/README.md | 29 +++++++++++++ configs/bisenetv1/bisenetv1.yml | 43 +++++++++++++++++++ ...1_r18-d32_4x8_1024x1024_160k_cityscapes.py | 11 +++++ model-index.yml | 1 + 4 files changed, 84 insertions(+) create mode 100644 configs/bisenetv1/README.md create mode 100644 configs/bisenetv1/bisenetv1.yml create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md new file mode 100644 index 0000000000..08b723d685 --- /dev/null +++ b/configs/bisenetv1/README.md @@ -0,0 +1,29 @@ +# BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation + +## Introduction + + + +```latex +@inproceedings{yu2018bisenet, + title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, + author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={325--341}, + year={2018} +} +``` + +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| BiSeNetV1 (4x4) | R-18-D32 | 1024x1024 | 160000 | 3.3 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | +| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 3.3 | - | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | + +Note: + +- `4x4`: Using 4 GPUs with 4 samples per GPU in training. +- `4x8`: Using 4 GPUs with 8 samples per GPU in training. diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml new file mode 100644 index 0000000000..94f2e26bff --- /dev/null +++ b/configs/bisenetv1/bisenetv1.yml @@ -0,0 +1,43 @@ +Collections: +- Metadata: + Training Data: + - Cityscapes + Name: bisenetv1 +Models: +- Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 74.37 + mIoU(ms+flip): 76.91 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 75.16 + mIoU(ms+flip): 77.24 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..a495b7190f --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py @@ -0,0 +1,11 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +lr_config = dict(warmup='linear', warmup_iters=1000) +optimizer = dict(lr=0.025) +data = dict( + samples_per_gpu=8, + workers_per_gpu=8, +) diff --git a/model-index.yml b/model-index.yml index d08ad33178..e6c0782275 100644 --- a/model-index.yml +++ b/model-index.yml @@ -1,6 +1,7 @@ Import: - configs/ann/ann.yml - configs/apcnet/apcnet.yml +- configs/bisenetv1/bisenetv1.yml - configs/ccnet/ccnet.yml - configs/cgnet/cgnet.yml - configs/danet/danet.yml From c331fa0b456cb72a16f0a618f32c73bd72f0aea2 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Wed, 22 Sep 2021 17:34:01 +0800 Subject: [PATCH 06/11] Adding Unittest --- mmseg/models/backbones/bisenetv1.py | 8 ++- .../test_backbones/test_bisenetv1.py | 55 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 519eed46da..3ac3fd8f4d 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -30,6 +30,9 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): super(SpatialPath, self).__init__(init_cfg=init_cfg) + assert len(num_channels) == 4, 'Length of input channels \ + of Spatial Path must be 4!' 
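+        # the four channel entries map onto a 7x7/2 stem, two 3x3/2 convs
+        # and a final 1x1 projection, i.e. an overall output stride of 8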
+ self.layers = [] for i in range(len(num_channels)): layer_name = f'layer{i + 1}' @@ -150,6 +153,9 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): super(ContextPath, self).__init__(init_cfg=init_cfg) + assert len(context_channels) == 3, 'Length of input channels \ + of Context Path must be 3!' + self.backbone = build_backbone(backbone_cfg) self.align_corners = align_corners @@ -267,7 +273,7 @@ class BiSeNetV1(BaseModule): Args: backbone_cfg:(dict): Config of backbone of Context Path. - in_channels(int): The number of channels of input + in_channels (int): The number of channels of input image. Default: 3. spatial_channels (Tuple[int]): Size of channel numbers of various layers in Spatial Path. diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 591388e71a..3b5a71e56d 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -3,6 +3,9 @@ import torch from mmseg.models.backbones import BiSeNetV1 +from mmseg.models.backbones.bisenetv1 import (AttentionRefinementModule, + ContextPath, FeatureFusionModule, + SpatialPath) def test_bisenetv1_backbone(): @@ -52,3 +55,55 @@ def test_bisenetv1_backbone(): backbone_cfg=backbone_cfg, in_channels=3, context_channels=(128, 256, 512, 1024)) + + +def test_bisenetv1_spatial_path(): + with pytest.raises(AssertionError): + # BiSeNetV1 spatial path channel constraints. + SpatialPath(num_channels=(64, 64, 64), in_channels=3) + + +def test_bisenetv1_context_path(): + backbone_cfg = dict( + type='ResNet', + in_channels=3, + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 1, 1), + strides=(1, 2, 2, 2), + norm_eval=False, + style='pytorch', + contract_dilation=True) + + with pytest.raises(AssertionError): + # BiSeNetV1 context path constraints. + ContextPath( + backbone_cfg=backbone_cfg, context_channels=(128, 256, 512, 1024)) + + +def test_bisenetv1_attention_refinement_module(): + x_arm = AttentionRefinementModule(512, 128) + assert x_arm.conv_layer.in_channels == 512 + assert x_arm.conv_layer.out_channels == 128 + assert x_arm.conv_layer.kernel_size == (3, 3) + x = torch.randn(2, 512, 32, 64) + x_out = x_arm(x) + assert x_out.shape == torch.Size([2, 128, 32, 64]) + + +def test_bisenetv1_feature_fusion_module(): + ffm = FeatureFusionModule(256, 512) + assert ffm.conv1.in_channels == 256 + assert ffm.conv1.out_channels == 512 + assert ffm.conv1.kernel_size == (1, 1) + assert ffm.gap.output_size == (1, 1) + assert ffm.conv_atten[0].in_channels == 512 + assert ffm.conv_atten[0].out_channels == 512 + assert ffm.conv_atten[0].kernel_size == (1, 1) + + ffm = FeatureFusionModule(256, 256) + x1 = torch.randn(2, 128, 128, 256) + x2 = torch.randn(2, 128, 128, 256) + x_out = ffm(x1, x2) + assert x_out.shape == torch.Size([2, 256, 128, 256]) From ce3149367cfb7cea7f4c88d89a59e5fb3070dc28 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Wed, 22 Sep 2021 22:38:59 +0800 Subject: [PATCH 07/11] Fixing typo --- mmseg/models/backbones/bisenetv1.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 3ac3fd8f4d..4beb7b394d 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -13,18 +13,18 @@ class SpatialPath(BaseModule): and encode affluent spatial information. Args: + in_channels(int): The number of channels of input + image. Default: 3. 
num_channels (Tuple[int]): The number of channels of each layers in Spatial Path. Default: (64, 64, 64, 128). - in_channels(int): The number of channels of input - image. Default: 3. Returns: x (torch.Tensor): Feature map for Feature Fusion Module. """ def __init__(self, - num_channels=(64, 64, 64, 128), in_channels=3, + num_channels=(64, 64, 64, 128), conv_cfg=None, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), @@ -315,7 +315,7 @@ def __init__(self, self.align_corners = align_corners self.context_path = ContextPath(backbone_cfg, context_channels, self.align_corners) - self.spatial_path = SpatialPath(spatial_channels, in_channels) + self.spatial_path = SpatialPath(in_channels, spatial_channels) self.ffm = FeatureFusionModule(context_channels[1], out_channels) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg From 013c57daa6c9c1acc9a616e257130591e917759a Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Mon, 27 Sep 2021 21:15:46 +0800 Subject: [PATCH 08/11] Uploading models & logs --- configs/bisenetv1/README.md | 9 +- configs/bisenetv1/bisenetv1.yml | 85 +++++++++++++++++-- ...in1k-pre_4x4_1024x1024_160k_cityscapes.py} | 9 +- ..._in1k-pre_4x8_1024x1024_160k_cityscapes.py | 5 ++ ...1_r50-d32_4x4_1024x1024_160k_cityscapes.py | 48 +++++++++++ ..._in1k-pre_4x4_1024x1024_160k_cityscapes.py | 7 ++ 6 files changed, 152 insertions(+), 11 deletions(-) rename configs/bisenetv1/{bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py => bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py} (56%) create mode 100644 configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py create mode 100644 configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py create mode 100644 configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md index 08b723d685..051842a69b 100644 --- a/configs/bisenetv1/README.md +++ b/configs/bisenetv1/README.md @@ -20,10 +20,13 @@ | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | | --------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| BiSeNetV1 (4x4) | R-18-D32 | 1024x1024 | 160000 | 3.3 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | -| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 3.3 | - | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) | 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | +| BiSeNetV1 (ResNet18, train from scratch) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) | +| BiSeNetV1 (ResNet18) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) | +| BiSeNetV1 (ResNet18, 4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) | +| BiSeNetV1 (ResNet50, train from scratch) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) | +| BiSeNetV1 (ResNet50) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) | Note: -- `4x4`: Using 4 GPUs with 4 samples per GPU in training. - `4x8`: Using 4 GPUs with 8 samples per GPU in training. 
+- Default setting is 4 GPUs with 4 samples per GPU in training. diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml index 94f2e26bff..6ce193a9a9 100644 --- a/configs/bisenetv1/bisenetv1.yml +++ b/configs/bisenetv1/bisenetv1.yml @@ -17,27 +17,100 @@ Models: resolution: (1024,1024) value: 31.48 lr schd: 160000 - memory (GB): 3.3 + memory (GB): 5.69 Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 74.44 + mIoU(ms+flip): 77.05 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-18-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 + lr schd: 160000 + memory (GB): 5.69 + Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes Results: Dataset: Cityscapes Metrics: mIoU: 74.37 mIoU(ms+flip): 76.91 Task: Semantic Segmentation - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth -- Config: configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth +- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py In Collection: bisenetv1 Metadata: backbone: R-18-D32 crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 31.48 lr schd: 160000 - memory (GB): 3.3 - Name: bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes + memory (GB): 11.17 + Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes Results: Dataset: Cityscapes Metrics: mIoU: 75.16 mIoU(ms+flip): 77.24 Task: Semantic Segmentation - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth +- Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: (1024,1024) + inference time (ms/im): + - backend: PyTorch + batch size: 1 + hardware: V100 + mode: FP32 + resolution: (1024,1024) + value: 129.7 + lr schd: 160000 + memory (GB): 3.3 + Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes + Results: + Dataset: Cityscapes + Metrics: + mIoU: 76.92 + mIoU(ms+flip): 78.87 + Task: Semantic Segmentation + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth +- Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py + In Collection: bisenetv1 + Metadata: + backbone: R-50-D32 + crop size: 
(1024,1024)
+    inference time (ms/im):
+    - backend: PyTorch
+      batch size: 1
+      hardware: V100
+      mode: FP32
+      resolution: (1024,1024)
+      value: 129.7
+    lr schd: 160000
+    memory (GB): 15.39
+  Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
+  Results:
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.68
+      mIoU(ms+flip): 79.57
+    Task: Semantic Segmentation
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
similarity index 56%
rename from configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py
rename to configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
index a495b7190f..ef061a16bd 100644
--- a/configs/bisenetv1/bisenetv1_r18-d32_4x8_1024x1024_160k_cityscapes.py
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
@@ -3,9 +3,14 @@
     '../_base_/datasets/cityscapes_1024x1024.py',
     '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
 ]
+model = dict(
+    backbone=dict(
+        backbone_cfg=dict(
+            init_cfg=dict(
+                type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))))
 lr_config = dict(warmup='linear', warmup_iters=1000)
 optimizer = dict(lr=0.025)
 data = dict(
-    samples_per_gpu=8,
-    workers_per_gpu=8,
+    samples_per_gpu=4,
+    workers_per_gpu=4,
 )
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..ea27ef0a11
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
@@ -0,0 +1,5 @@
+_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'
+data = dict(
+    samples_per_gpu=8,
+    workers_per_gpu=8,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
new file mode 100644
index 0000000000..72e3024aef
--- /dev/null
+++ b/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
@@ -0,0 +1,48 @@
+_base_ = [
+    '../_base_/models/bisenetv1_r18-d32.py',
+    '../_base_/datasets/cityscapes_1024x1024.py',
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='BiSeNetV1',
+        context_channels=(512, 1024, 2048),
+        spatial_channels=(256, 256, 256, 512),
+        out_channels=1024,
+        backbone_cfg=dict(
+            type='ResNet',
+            depth=50)),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=0,
+        channels=1024),
+    auxiliary_head=[
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=1,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+        dict(
+            type='FCNHead',
+            in_channels=512,
+            channels=256,
+            num_convs=1,
+            num_classes=19,
+            in_index=2,
+            norm_cfg=norm_cfg,
+            concat_input=False),
+    ])
+lr_config = dict(warmup='linear', warmup_iters=1000)
+optimizer = dict(lr=0.05)
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+)
diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py new file mode 100644 index 0000000000..5625a76c08 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py @@ -0,0 +1,7 @@ +_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py' +model = dict( + type='EncoderDecoder', + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) From ac1b2f73f469ac990e3b547a96a36d03af4eb2ae Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Tue, 28 Sep 2021 13:57:52 +0800 Subject: [PATCH 09/11] Fixing unittest error --- .../test_backbones/test_bisenetv1.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tests/test_models/test_backbones/test_bisenetv1.py b/tests/test_models/test_backbones/test_bisenetv1.py index 3b5a71e56d..8e1571d6fb 100644 --- a/tests/test_models/test_backbones/test_bisenetv1.py +++ b/tests/test_models/test_backbones/test_bisenetv1.py @@ -25,20 +25,20 @@ def test_bisenetv1_backbone(): model.init_weights() model.train() batch_size = 2 - imgs = torch.randn(batch_size, 3, 512, 1024) + imgs = torch.randn(batch_size, 3, 256, 512) feat = model(imgs) assert len(feat) == 3 # output for segment Head - assert feat[0].shape == torch.Size([batch_size, 256, 64, 128]) + assert feat[0].shape == torch.Size([batch_size, 256, 32, 64]) # for auxiliary head 1 - assert feat[1].shape == torch.Size([batch_size, 128, 64, 128]) + assert feat[1].shape == torch.Size([batch_size, 128, 32, 64]) # for auxiliary head 2 - assert feat[2].shape == torch.Size([batch_size, 128, 32, 64]) + assert feat[2].shape == torch.Size([batch_size, 128, 16, 32]) # Test input with rare shape batch_size = 2 - imgs = torch.randn(batch_size, 3, 952, 527) + imgs = torch.randn(batch_size, 3, 527, 279) feat = model(imgs) assert len(feat) == 3 @@ -83,27 +83,27 @@ def test_bisenetv1_context_path(): def test_bisenetv1_attention_refinement_module(): - x_arm = AttentionRefinementModule(512, 128) - assert x_arm.conv_layer.in_channels == 512 - assert x_arm.conv_layer.out_channels == 128 + x_arm = AttentionRefinementModule(256, 64) + assert x_arm.conv_layer.in_channels == 256 + assert x_arm.conv_layer.out_channels == 64 assert x_arm.conv_layer.kernel_size == (3, 3) - x = torch.randn(2, 512, 32, 64) + x = torch.randn(2, 256, 32, 64) x_out = x_arm(x) - assert x_out.shape == torch.Size([2, 128, 32, 64]) + assert x_out.shape == torch.Size([2, 64, 32, 64]) def test_bisenetv1_feature_fusion_module(): - ffm = FeatureFusionModule(256, 512) - assert ffm.conv1.in_channels == 256 - assert ffm.conv1.out_channels == 512 + ffm = FeatureFusionModule(128, 256) + assert ffm.conv1.in_channels == 128 + assert ffm.conv1.out_channels == 256 assert ffm.conv1.kernel_size == (1, 1) assert ffm.gap.output_size == (1, 1) - assert ffm.conv_atten[0].in_channels == 512 - assert ffm.conv_atten[0].out_channels == 512 + assert ffm.conv_atten[0].in_channels == 256 + assert ffm.conv_atten[0].out_channels == 256 assert ffm.conv_atten[0].kernel_size == (1, 1) - ffm = FeatureFusionModule(256, 256) - x1 = torch.randn(2, 128, 128, 256) - x2 = torch.randn(2, 128, 128, 256) + ffm = FeatureFusionModule(128, 128) + x1 = torch.randn(2, 64, 64, 128) + x2 = torch.randn(2, 64, 64, 128) x_out = ffm(x1, x2) - assert x_out.shape == torch.Size([2, 256, 128, 256]) + assert x_out.shape == torch.Size([2, 128, 64, 128]) From f28204d3582e082a1fcbc91522b24d60a9ea4682 Mon Sep 17 00:00:00 2001 From: MengzhangLI Date: Tue, 28 Sep 2021 
20:19:40 +0800
Subject: [PATCH 10/11] changing README.md

---
 configs/bisenetv1/README.md     | 20 ++++--
 configs/bisenetv1/bisenetv1.yml | 83 ++++++++++++++++++---------------
 2 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
index 051842a69b..cd0ebd526b 100644
--- a/configs/bisenetv1/README.md
+++ b/configs/bisenetv1/README.md
@@ -4,6 +4,13 @@

+<a href="https://github.com/ycszen/TorchSeg/tree/master/model/bisenet">Official Repo</a>
+
+<a href="https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266">Code Snippet</a>
+
+<details>
+<summary align="right"><a href="https://arxiv.org/abs/1808.00897">BiSeNetV1 (ECCV'2018)</a></summary>
+
 ```latex
 @inproceedings{yu2018bisenet,
   title={Bisenet: Bilateral segmentation network for real-time semantic segmentation},
   author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong},
   booktitle={Proceedings of the European conference on computer vision (ECCV)},
   pages={325--341},
   year={2018}
 }
 ```
+
+</details>
+
 ## Results and models

 ### Cityscapes

 | Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
 | ------ | -------- | --------- | ------: | -------- | -------------- | ---: | ------------- | ------ | -------- |
-| BiSeNetV1 (ResNet18, train from scratch) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
-| BiSeNetV1 (ResNet18) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
-| BiSeNetV1 (ResNet18, 4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
-| BiSeNetV1 (ResNet50, train from scratch) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
-| BiSeNetV1 (ResNet50) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |
+| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) |
+| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) |
+| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) |
+| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 3.3 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) |
+| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) |

 Note:

 - `4x8`: Using 4 GPUs with 8 samples per GPU in training.
 - Default setting is 4 GPUs with 4 samples per GPU in training.
+- `No Pretrain` means the model is trained from scratch.
diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml
index 6ce193a9a9..6de872b863 100644
--- a/configs/bisenetv1/bisenetv1.yml
+++ b/configs/bisenetv1/bisenetv1.yml
@@ -1,116 +1,125 @@
 Collections:
-- Metadata:
+- Name: bisenetv1
+  Metadata:
     Training Data:
     - Cityscapes
-  Name: bisenetv1
+  Paper:
+    URL: https://arxiv.org/abs/1808.00897
+    Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation'
+  README: configs/bisenetv1/README.md
+  Code:
+    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266
+    Version: v0.18.0
+  Converted From:
+    Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet
 Models:
-- Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 5.69
-  Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 74.44
      mIoU(ms+flip): 77.05
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth
-- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 5.69
-  Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 74.37
       mIoU(ms+flip): 76.91
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth
-- Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-18-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 31.48
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 31.48
-    lr schd: 160000
     memory (GB): 11.17
-  Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 75.16
       mIoU(ms+flip): 77.24
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth
-- Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-50-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 129.7
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 129.7
-    lr schd: 160000
     memory (GB): 3.3
-  Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 76.92
       mIoU(ms+flip): 78.87
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth
-- Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
+- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   In Collection: bisenetv1
   Metadata:
     backbone: R-50-D32
     crop size: (1024,1024)
+    lr schd: 160000
     inference time (ms/im):
-    - backend: PyTorch
-      batch size: 1
+    - value: 129.7
       hardware: V100
+      backend: PyTorch
+      batch size: 1
       mode: FP32
       resolution: (1024,1024)
-      value: 129.7
-    lr schd: 160000
     memory (GB): 15.39
-  Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes
   Results:
+  - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
       mIoU: 77.68
       mIoU(ms+flip): 79.57
-    Task: Semantic Segmentation
+  Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py
   Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth

From 79b00c08622a4a0b4a48b9e3a299eabe6424f98a Mon Sep 17 00:00:00 2001
From: MengzhangLI
Date: Tue, 28 Sep 2021 20:21:25 +0800
Subject: [PATCH 11/11] changing README.md

---
 configs/bisenetv1/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md
index cd0ebd526b..344781068a 100644
--- a/configs/bisenetv1/README.md
+++ b/configs/bisenetv1/README.md
@@ -39,4 +39,4 @@ Note:

 - `4x8`: Using 4 GPUs with 8 samples per GPU in training.
 - Default setting is 4 GPUs with 4 samples per GPU in training.
-- `No Pretrain` means the model is trained from scratch.
+- `No Pretrain` means the model is trained from scratch.
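The `4x8` rows in the table above differ from the default `4x4` schedule only in the data loader settings. A minimal sketch of such a config is shown below, following the `_base_` inheritance pattern this PR uses in `bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py`. The shipped `bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py` is not reproduced in this patch set, so the exact override (in particular `workers_per_gpu=8`) is an assumption here, not the verified file contents.

```python
# Sketch only: a 4 GPU x 8 samples-per-GPU variant expressed as an
# override of the 4x4 ImageNet-pretrained config. Values other than
# samples_per_gpu are assumptions; the shipped 4x8 config may differ.
_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py'

data = dict(
    samples_per_gpu=8,  # `4x8`: 8 samples on each of the 4 GPUs
    workers_per_gpu=8,  # assumption: workers scaled with batch size
)
```

With 4 GPUs this doubles the effective batch size from 16 to 32, which is consistent with the larger memory footprint reported for the `4x8` row (11.17 GB vs. 5.69 GB).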