Commit
* add smot * lint * lint * lint * tutorial * tutorial change * fix comments
Showing 23 changed files with 3,007 additions and 1 deletion.
@@ -0,0 +1,46 @@
"""03. Multiple object tracking with pre-trained SMOT models
=============================================================
In this tutorial, we present a method
called `Single-Shot Multi Object Tracking (SMOT) <https://arxiv.org/abs/2010.16031>`_ for performing multi-object tracking.
SMOT is a new tracking framework that converts any single-shot detector (SSD) model into an online multiple object tracker,
emphasizing simultaneous detection and tracking of object paths.
As an example, below we directly use the SSD-MobileNet object detector pretrained on COCO from :ref:`gluoncv-model-zoo`
and perform multiple object tracking on an arbitrary video.
We want to point out that SMOT is very efficient: its runtime is close to that of the chosen detector.
"""


######################################################################
# Predict with a SMOT model
# ----------------------------
#
# First, we download a video from the MOT Challenge website.

from gluoncv import utils
video_path = 'https://motchallenge.net/sequenceVideos/MOT17-02-FRCNN-raw.webm'
im_video = utils.download(video_path)

################################################################
# Then you can simply use our provided script under `/scripts/tracking/smot/demo.py` to obtain the multi-object tracking result.
#
# ::
#
#     python demo.py MOT17-02-FRCNN-raw.webm
#
#
################################################################
# You can see the tracking results below. Here, we only track persons,
# but you can track other objects as long as your detector is trained on that category.
#
# .. raw:: html
#
#     <div align="center">
#         <img src="../../_static/smot_demo.gif">
#     </div>
#
#     <br>

################################################################
# Our model is able to track multiple persons even when they are partially occluded.
# Try it on your own video and see the results!
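Not everything the demo script does is visible in the diff above. As a quick sanity check before running the tracker, the sketch below (an illustration only, not part of this commit) uses OpenCV to confirm that the downloaded `MOT17-02-FRCNN-raw.webm` clip decodes correctly and to count its frames; the use of `cv2` here is an assumption, since the demo script performs its own video decoding.

```python
# Illustrative sketch, not part of this commit: decode the downloaded clip
# with OpenCV and count its frames before handing it to the tracking demo.
import cv2

cap = cv2.VideoCapture('MOT17-02-FRCNN-raw.webm')
num_frames = 0
while True:
    ret, frame = cap.read()  # frame is an HxWx3 BGR array; ret is False at end of video
    if not ret:
        break
    num_frames += 1
cap.release()
print('decoded %d frames' % num_frames)
```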
@@ -39,3 +39,4 @@
from .siamrpn import *
from .fastscnn import *
from .monodepthv2 import *
from .smot import *
@@ -0,0 +1,8 @@
# pylint: disable=wildcard-import
"""
SMOT: Single-Shot Multi Object Tracking
https://arxiv.org/abs/2010.16031
"""
from __future__ import absolute_import
from .smot_tracker import *
from .tracktors import *
@@ -0,0 +1,79 @@
# pylint: disable=unused-import
"""Anchor box generator for SSD detector."""
from __future__ import absolute_import

import numpy as np
from mxnet import gluon


class SSDAnchorGenerator(gluon.HybridBlock):
    """Bounding box anchor generator for Single-shot Object Detection.

    Parameters
    ----------
    index : int
        Index of this generator in SSD models, this is required for naming.
    im_size : tuple of int
        Input image size as (H, W), used to clip anchors when ``clip`` is True.
    sizes : iterable of floats
        Sizes of anchor boxes.
    ratios : iterable of floats
        Aspect ratios of anchor boxes.
    step : int or float
        Step size of anchor boxes.
    alloc_size : tuple of int
        Allocate size for the anchor boxes as (H, W).
        Usually we generate enough anchors for a large feature map, e.g. 128x128.
        Later in inference we can have variable input sizes,
        at which time we can crop corresponding anchors from this large
        anchor map so we can skip re-generating anchors for each input.
    offsets : tuple of float
        Center offsets of anchor boxes as (h, w) in range(0, 1).
    clip : bool, default is False
        Whether to clip anchor coordinates to the image boundaries given by ``im_size``.
    """
    def __init__(self, index, im_size, sizes, ratios, step, alloc_size=(128, 128),
                 offsets=(0.5, 0.5), clip=False, **kwargs):
        super(SSDAnchorGenerator, self).__init__(**kwargs)
        assert len(im_size) == 2
        self._im_size = im_size
        self._clip = clip
        self._sizes = sizes
        self._ratios = ratios
        anchors = self._generate_anchors(self._sizes, self._ratios, step, alloc_size, offsets)
        self._num_anchors = np.size(anchors) // 4
        self.anchors = self.params.get_constant('anchor_%d' % (index), anchors)

    def _generate_anchors(self, sizes, ratios, step, alloc_size, offsets):
        # pylint: disable=unused-argument,too-many-function-args
        """Generate anchors once. Anchors are stored in (center_x, center_y, w, h) format."""
        anchors = []
        for i in range(alloc_size[0]):
            for j in range(alloc_size[1]):
                cy = (i + offsets[0]) * step
                cx = (j + offsets[1]) * step

                for sz in self._sizes:
                    for r in ratios:
                        sr = np.sqrt(r)
                        w = sz * sr
                        h = sz / sr
                        anchors.append([cx, cy, w, h])
        return np.array(anchors).reshape(1, 1, alloc_size[0], alloc_size[1], -1)

    @property
    def num_depth(self):
        """Number of anchors at each pixel."""
        return len(self._sizes) * len(self._ratios)

    @property
    def num_anchors(self):
        """Total number of pre-generated anchors."""
        return self._num_anchors

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x, anchors):
        a = F.slice_like(anchors, x * 0, axes=(2, 3))
        a = a.reshape((1, -1, 4))
        if self._clip:
            cx, cy, cw, ch = a.split(axis=-1, num_outputs=4)
            H, W = self._im_size
            a = F.concat(*[cx.clip(0, W), cy.clip(0, H), cw.clip(0, W), ch.clip(0, H)], dim=-1)
        return a.reshape((1, -1, 4))
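To make the anchor layout concrete, here is a small usage sketch (not part of the commit; the sizes, ratios, and feature-map shape are illustrative assumptions). The generator pre-computes a 128x128 anchor map once, and at run time `F.slice_like` crops it to the spatial size of whatever feature map is passed in.

```python
# Illustrative sketch only; parameter values below are assumptions, not from this commit.
import mxnet as mx

gen = SSDAnchorGenerator(index=0, im_size=(512, 512), sizes=(51.2, 102.4),
                         ratios=(1, 2, 0.5), step=16)
gen.initialize()  # materializes the 'anchor_0' constant parameter

# Only the spatial size (32x32) of this dummy feature map matters: the
# pre-generated 128x128 anchor map is cropped to match it.
feat = mx.nd.zeros((1, 256, 32, 32))
anchors = gen(feat)

print(gen.num_depth)   # 6 anchors per pixel = len(sizes) * len(ratios)
print(anchors.shape)   # (1, 32 * 32 * 6, 4), in (center_x, center_y, w, h) format
```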
@@ -0,0 +1,106 @@
"""
MXNet implementation of SMOT: Single-Shot Multi Object Tracking
https://arxiv.org/abs/2010.16031
"""
from mxnet import gluon
from gluoncv.nn.bbox import BBoxCenterToCorner


class NormalizedLandmarkCenterDecoder(gluon.HybridBlock):
    """
    Decode landmark (keypoint) training targets with normalized center offsets.
    This decoder must use the same `stds` as the encoder that produced the targets
    in order to get properly reconstructed landmarks.
    Returned landmarks are five (x, y) pairs in image coordinates.

    Parameters
    ----------
    stds : array-like of size 4
        Std values multiplied with the encoded values, default is (0.1, 0.1, 0.2, 0.2).
    means : array-like of size 4
        Mean values added to the encoded values, default is (0., 0., 0., 0.).
    convert_anchor : bool, default is True
        Whether to convert anchors from (center_x, center_y, w, h) to corner format before decoding.
    """

    def __init__(self, stds=(0.1, 0.1, 0.2, 0.2), means=(0., 0., 0., 0.),
                 convert_anchor=True):
        super(NormalizedLandmarkCenterDecoder, self).__init__()
        assert len(stds) == 4, "Landmark decoder requires 4 std values."
        self._stds = stds
        self._means = means
        if convert_anchor:
            self.center_to_corner = BBoxCenterToCorner(split=True)
        else:
            self.center_to_corner = None

    def hybrid_forward(self, F, x, anchors):
        """center decoder forward"""
        if self.center_to_corner is not None:
            a = self.center_to_corner(anchors)
        else:
            a = anchors.split(axis=-1, num_outputs=4)
        ld = F.split(x, axis=-1, num_outputs=10)

        x0 = F.broadcast_add(F.broadcast_mul(ld[0] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
        y0 = F.broadcast_add(F.broadcast_mul(ld[1] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])
        x1 = F.broadcast_add(F.broadcast_mul(ld[2] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
        y1 = F.broadcast_add(F.broadcast_mul(ld[3] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])
        x2 = F.broadcast_add(F.broadcast_mul(ld[4] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
        y2 = F.broadcast_add(F.broadcast_mul(ld[5] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])
        x3 = F.broadcast_add(F.broadcast_mul(ld[6] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
        y3 = F.broadcast_add(F.broadcast_mul(ld[7] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])
        x4 = F.broadcast_add(F.broadcast_mul(ld[8] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
        y4 = F.broadcast_add(F.broadcast_mul(ld[9] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])

        return F.concat(x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, dim=-1)


class GeneralNormalizedKeyPointsDecoder(gluon.HybridBlock):
    """
    Decode keypoint training targets with normalized center offsets.
    This decoder must use the same `stds` as the encoder that produced the targets
    in order to get properly reconstructed keypoints.
    Returned keypoints are `num_points` (x, y) pairs in image coordinates.

    Parameters
    ----------
    num_points : int
        Number of keypoints to decode.
    stds : array-like of size 2
        Std values multiplied with the encoded values, default is (0.2, 0.2).
    means : array-like of size 2
        Mean values added to the encoded values, default is (0.5, 0.2).
    convert_anchor : bool, default is True
        Whether to convert anchors from (center_x, center_y, w, h) to corner format before decoding.
    """

    def __init__(self, num_points, stds=(0.2, 0.2), means=(0.5, 0.2),
                 convert_anchor=True):
        super(GeneralNormalizedKeyPointsDecoder, self).__init__()
        assert len(stds) == 2, "Keypoint decoder requires 2 std values."
        self._stds = stds
        self._means = means
        self._size = num_points * 2
        if convert_anchor:
            self.center_to_corner = BBoxCenterToCorner(split=True)
        else:
            self.center_to_corner = None

    def hybrid_forward(self, F, x, anchors):
        """key point decoder forward"""
        if self.center_to_corner is not None:
            a = self.center_to_corner(anchors)
        else:
            a = anchors.split(axis=-1, num_outputs=4)
        ld = F.split(x, axis=-1, num_outputs=self._size)

        outputs = []
        for i in range(0, self._size, 2):
            x = F.broadcast_add(F.broadcast_mul(ld[i] * self._stds[0] + self._means[0], a[2] - a[0]), a[0])
            y = F.broadcast_add(F.broadcast_mul(ld[i + 1] * self._stds[1] + self._means[1], a[3] - a[1]), a[1])
            outputs.extend([x, y])

        return F.concat(*outputs, dim=-1)
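As a quick illustration of the decoder's input/output contract (not from the commit; shapes and random inputs are assumptions), the sketch below pushes encoded offsets and matching anchors through `GeneralNormalizedKeyPointsDecoder` and recovers one (x, y) pair per keypoint.

```python
# Illustrative sketch only; batch size, anchor count, and random values are
# assumptions used to show the expected tensor shapes.
import mxnet as mx

num_points = 5
decoder = GeneralNormalizedKeyPointsDecoder(num_points=num_points)

# 100 anchors in (center_x, center_y, w, h) format plus matching encoded offsets.
anchors = mx.nd.random.uniform(shape=(1, 100, 4))
encoded = mx.nd.random.uniform(shape=(1, 100, num_points * 2))

keypoints = decoder(encoded, anchors)
print(keypoints.shape)  # (1, 100, 10): five decoded (x, y) pairs per anchor
```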