Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP][DO NOT MERGE] Matrixnet #1328

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gluoncv/data/transforms/presets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
from . import imagenet
from . import simple_pose
from . import segmentation
from . import matrix_net
196 changes: 196 additions & 0 deletions gluoncv/data/transforms/presets/matrix_net.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
"""Transforms described in https://arxiv.org/abs/1904.07850 and https://arxiv.org/abs/2001.03194."""
# pylint: disable=too-many-function-args
from __future__ import absolute_import
import numpy as np
import mxnet as mx
from .. import bbox as tbbox
from .. import image as timage
from .. import experimental
from ....utils.filesystem import try_import_cv2

__all__ = ['MatrixNetDefaultTrainTransform', 'MatrixNetDefaultValTransform',
'get_post_transform']

class MatrixNetDefaultTrainTransform(object):
    """Default MatrixNet training transform which includes tons of image augmentations.

    Parameters
    ----------
    width : int
        Image width.
    height : int
        Image height.
    num_class : int
        Number of categories.
    layers_range : list of list of number(list of number)
        Represents the same meaning as that of MatrixNet.
    scale_factor : int, default is 4
        The downsampling scale factor between input image and output heatmap.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].
    """
    def __init__(self, width, height, num_class, layers_range, scale_factor=4, mean=(0.485, 0.456, 0.406),
                 std=(0.229, 0.224, 0.225), **kwargs):
        self._kwargs = kwargs
        self._width = width
        self._height = height
        self._num_class = num_class
        self._layers_range = layers_range
        self._scale_factor = scale_factor
        self._mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
        self._std = np.array(std, dtype=np.float32).reshape(1, 1, 3)
        # Fixed-seed RNG keeps the color-distortion augmentation reproducible.
        self._data_rng = np.random.RandomState(123)
        # NOTE(review): eigen values/vectors presumably PCA of ImageNet RGB
        # statistics (lighting augmentation); kept verbatim -- confirm upstream.
        self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
                                 dtype=np.float32)
        self._eig_vec = np.array([
            [-0.58752847, -0.69563484, 0.41340352],
            [-0.5832747, 0.00994535, -0.81221408],
            [-0.56089297, 0.71832671, 0.41158938]
        ], dtype=np.float32)

        # Function-scope import avoids a circular dependency between the data
        # transforms and the model zoo package.
        from ....model_zoo.matrix_net.target_generator import MatrixNetTargetGenerator
        self._target_generator = MatrixNetTargetGenerator(
            num_class, width, height, self._layers_range)

    def __call__(self, src, label):
        """Apply transform to training image/label.

        Parameters
        ----------
        src : mxnet.nd.NDArray
            Input image, shape (H, W, 3).
        label : numpy.ndarray
            Boxes of shape (N, 5+); columns 0-3 are (xmin, ymin, xmax, ymax)
            and column 4 is the class id.

        Returns
        -------
        tuple
            ``(image, *heatmaps, *wh_targets, *wh_masks, *center_regs,
            *center_reg_masks)`` -- image is a (3, height, width) NDArray,
            the rest are per-layer training targets.
        """
        img = src
        bbox = label

        # random horizontal flip
        h, w, _ = img.shape
        img, flips = timage.random_flip(img, px=0.5)
        bbox = tbbox.flip(bbox, (w, h), flip_x=flips[0])

        cv2 = try_import_cv2()
        input_h, input_w = self._height, self._width
        # Random scale/shift crop: jitter the crop center inside a safe border
        # and scale the crop size by a factor drawn from [1 - sf, 1 + sf].
        s = max(h, w) * 1.0
        c = np.array([w / 2., h / 2.], dtype=np.float32)
        sf = 0.4
        w_border = _get_border(128, img.shape[1])
        h_border = _get_border(128, img.shape[0])
        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
        s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
        trans_input = tbbox.get_affine_transform(c, s, 0, [input_w, input_h])
        inp = cv2.warpAffine(img.asnumpy(), trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)

        # Map the boxes through the same affine transform, then clip them to
        # the network input canvas.
        trans_output = tbbox.get_affine_transform(c, s, 0, [input_w, input_h])
        for i in range(bbox.shape[0]):
            bbox[i, :2] = tbbox.affine_transform(bbox[i, :2], trans_output)
            bbox[i, 2:4] = tbbox.affine_transform(bbox[i, 2:4], trans_output)
        # BUGFIX: clip x coordinates (cols 0, 2) against the width and
        # y coordinates (cols 1, 3) against the height. The previous code
        # clipped (xmin, ymin) by width and (xmax, ymax) by height, which is
        # wrong whenever input_w != input_h.
        bbox[:, 0:4:2] = np.clip(bbox[:, 0:4:2], 0, input_w - 1)
        bbox[:, 1:4:2] = np.clip(bbox[:, 1:4:2], 0, input_h - 1)
        img = inp

        # to tensor
        img = img.astype(np.float32) / 255.
        # NOTE(review): assumed to distort `img` in place (return value is
        # unused, matching the CenterNet preset) -- confirm against
        # experimental.image.np_random_color_distort.
        experimental.image.np_random_color_distort(img, data_rng=self._data_rng)
        img = (img - self._mean) / self._std
        img = img.transpose(2, 0, 1).astype(np.float32)
        img = mx.nd.array(img)

        # generate training target so cpu workers can help reduce the workload on gpu
        gt_bboxes = bbox[:, :4]
        gt_ids = bbox[:, 4:5]
        heatmaps, wh_targets, wh_masks, center_regs, center_reg_masks = self._target_generator(
            gt_bboxes, gt_ids)
        results = [img]
        for group in (heatmaps, wh_targets, wh_masks, center_regs, center_reg_masks):
            results.extend(group)
        return tuple(results)


class MatrixNetDefaultValTransform(object):
    """Default MatrixNet validation transform.

    Parameters
    ----------
    width : int
        Image width.
    height : int
        Image height.
    mean : array-like of size 3
        Mean pixel values to be subtracted from image tensor. Default is [0.485, 0.456, 0.406].
    std : array-like of size 3
        Standard deviation to be divided from image. Default is [0.229, 0.224, 0.225].

    """
    def __init__(self, width, height, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
        self._width = width
        self._height = height
        self._mean = np.array(mean, dtype=np.float32).reshape(1, 1, 3)
        self._std = np.array(std, dtype=np.float32).reshape(1, 1, 3)

    def __call__(self, src, label):
        """Apply transform to validation image/label.

        Parameters
        ----------
        src : mxnet.nd.NDArray
            Input image, shape (H, W, 3).
        label : numpy.ndarray
            Boxes of shape (N, 4+); columns 0-3 are (xmin, ymin, xmax, ymax).

        Returns
        -------
        tuple
            ``(image, bbox)`` -- image is a (3, height, width) NDArray and
            bbox the transformed boxes cast to the image dtype.
        """
        # resize: aspect-preserving affine warp centered on the image
        img, bbox = src.asnumpy(), label
        cv2 = try_import_cv2()
        input_h, input_w = self._height, self._width
        h, w, _ = src.shape
        s = max(h, w) * 1.0
        c = np.array([w / 2., h / 2.], dtype=np.float32)
        trans_input = tbbox.get_affine_transform(c, s, 0, [input_w, input_h])
        inp = cv2.warpAffine(img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR)
        # Boxes are kept at the input resolution (no downsampling here).
        output_w = input_w
        output_h = input_h
        trans_output = tbbox.get_affine_transform(c, s, 0, [output_w, output_h])
        for i in range(bbox.shape[0]):
            bbox[i, :2] = tbbox.affine_transform(bbox[i, :2], trans_output)
            bbox[i, 2:4] = tbbox.affine_transform(bbox[i, 2:4], trans_output)
        # BUGFIX: clip x coordinates (cols 0, 2) against the width and
        # y coordinates (cols 1, 3) against the height. The previous code
        # clipped (xmin, ymin) by width and (xmax, ymax) by height, which is
        # wrong whenever output_w != output_h.
        bbox[:, 0:4:2] = np.clip(bbox[:, 0:4:2], 0, output_w - 1)
        bbox[:, 1:4:2] = np.clip(bbox[:, 1:4:2], 0, output_h - 1)
        img = inp

        # to tensor
        img = img.astype(np.float32) / 255.
        img = (img - self._mean) / self._std
        img = img.transpose(2, 0, 1).astype(np.float32)
        img = mx.nd.array(img)
        return img, bbox.astype(img.dtype)

def get_post_transform(orig_w, orig_h, out_w, out_h):
    """Get the post prediction affine transforms. This will be used to adjust the prediction results
    according to original coco image resolutions.

    Parameters
    ----------
    orig_w : int
        Original width of the image.
    orig_h : int
        Original height of the image.
    out_w : int
        Width of the output image after prediction.
    out_h : int
        Height of the output image after prediction.

    Returns
    -------
    numpy.ndarray
        Inverse affine transform matrix, as produced by
        ``tbbox.get_affine_transform`` with ``inv=True``.

    """
    # Invert the same center/scale warp used at input time so predictions map
    # back onto the original image resolution.
    scale = max(orig_w, orig_h) * 1.0
    center = np.array([orig_w / 2., orig_h / 2.], dtype=np.float32)
    return tbbox.get_affine_transform(center, scale, 0, [out_w, out_h], inv=True)

def _get_border(border, size):
"""Get the border size of the image"""
i = 1
while size - border // i <= border // i:
i *= 2
return border // i
5 changes: 5 additions & 0 deletions gluoncv/model_zoo/matrix_net/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""MatrixNet"""
# pylint: disable=wildcard-import
from __future__ import absolute_import

from .matrix_net import *
Loading