From 8be3fd186080d157c1a61e52f95aa15bb8b94292 Mon Sep 17 00:00:00 2001
From: Karan Jariwala
Date: Mon, 10 Aug 2020 12:08:20 -0700
Subject: [PATCH 1/2] Sharding of data for Horovod distributed training

---
 gluoncv/data/pascal_voc/detection.py  | 11 +++++++++++
 scripts/detection/ssd/train_ssd.py    | 10 ++++++++--
 scripts/detection/yolo/train_yolo3.py | 17 ++++++++++++++---
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/gluoncv/data/pascal_voc/detection.py b/gluoncv/data/pascal_voc/detection.py
index 28b73472c7..febfc2c240 100644
--- a/gluoncv/data/pascal_voc/detection.py
+++ b/gluoncv/data/pascal_voc/detection.py
@@ -54,6 +54,7 @@ def __init__(self, root=os.path.join('~', '.mxnet', 'datasets', 'voc'),
                  transform=None, index_map=None, preload_label=True):
         super(VOCDetection, self).__init__(root)
         self._im_shapes = {}
+        self._im_aspect_ratios = None
         self._root = os.path.expanduser(root)
         self._transform = transform
         self._splits = splits
@@ -151,6 +152,16 @@ def _preload_labels(self):
         logging.debug("Preloading %s labels into memory...", str(self))
         return [self._load_label(idx) for idx in range(len(self))]
 
+    def get_im_aspect_ratio(self):
+        """Return the aspect ratio of each image in the order of the raw data."""
+        if self._im_aspect_ratios is not None:
+            return self._im_aspect_ratios
+        self._im_aspect_ratios = [None] * len(self._im_shapes)
+        for i, im_shape in self._im_shapes.items():
+            self._im_aspect_ratios[i] = 1.0 * im_shape[0] / im_shape[1]
+
+        return self._im_aspect_ratios
+
 
 class CustomVOCDetection(VOCDetection):
     """Custom Pascal VOC detection Dataset.
diff --git a/scripts/detection/ssd/train_ssd.py b/scripts/detection/ssd/train_ssd.py
index 28da787089..833c71eac5 100644
--- a/scripts/detection/ssd/train_ssd.py
+++ b/scripts/detection/ssd/train_ssd.py
@@ -126,9 +126,15 @@ def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_
     _, _, anchors = net(mx.nd.zeros((1, 3, height, width), ctx))
     anchors = anchors.as_in_context(mx.cpu())
     batchify_fn = Tuple(Stack(), Stack(), Stack())  # stack image, cls_targets, box_targets
+    train_sampler = \
+        gcv.nn.sampler.SplitSortedBucketSampler(train_dataset.get_im_aspect_ratio(),
+                                                batch_size,
+                                                num_parts=hvd.size() if args.horovod else 1,
+                                                part_index=hvd.rank() if args.horovod else 0,
+                                                shuffle=True)
     train_loader = gluon.data.DataLoader(
         train_dataset.transform(SSDDefaultTrainTransform(width, height, anchors)),
-        batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
+        batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
     val_loader = gluon.data.DataLoader(
         val_dataset.transform(SSDDefaultValTransform(width, height)),
@@ -349,7 +355,7 @@ def train(net, train_data, val_data, eval_metric, ctx, args):
             name2, loss2 = smoothl1_metric.get()
             logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                 epoch, (time.time()-tic), name1, loss1, name2, loss2))
-            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
+            if ((epoch + 1) % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
                 # consider reduce the frequency of validation to save time
                 map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                 val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
diff --git a/scripts/detection/yolo/train_yolo3.py b/scripts/detection/yolo/train_yolo3.py
index 95684eaa83..6088a2ec2e 100644
--- a/scripts/detection/yolo/train_yolo3.py
+++ b/scripts/detection/yolo/train_yolo3.py
@@ -130,15 +130,26 @@ def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_
     """Get dataloader."""
     width, height = data_shape, data_shape
     batchify_fn = Tuple(*([Stack() for _ in range(6)] + [Pad(axis=0, pad_val=-1) for _ in range(1)]))  # stack image, all targets generated
+
+    if args.mixup:
+        im_aspect_ratio = train_dataset._dataset.get_im_aspect_ratio()
+    else:
+        im_aspect_ratio = train_dataset.get_im_aspect_ratio()
+    train_sampler = \
+        gcv.nn.sampler.SplitSortedBucketSampler(im_aspect_ratio,
+                                                batch_size,
+                                                num_parts=hvd.size() if args.horovod else 1,
+                                                part_index=hvd.rank() if args.horovod else 0,
+                                                shuffle=True)
     if args.no_random_shape:
         train_loader = gluon.data.DataLoader(
             train_dataset.transform(YOLO3DefaultTrainTransform(width, height, net, mixup=args.mixup)),
-            batch_size, True, batchify_fn=batchify_fn, last_batch='rollover', num_workers=num_workers)
+            batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     else:
         transform_fns = [YOLO3DefaultTrainTransform(x * 32, x * 32, net, mixup=args.mixup) for x in range(10, 20)]
         train_loader = RandomTransformDataLoader(
-            transform_fns, train_dataset, batch_size=batch_size, interval=10, last_batch='rollover',
-            shuffle=True, batchify_fn=batchify_fn, num_workers=num_workers)
+            transform_fns, train_dataset, interval=10,
+            batch_sampler=train_sampler, batchify_fn=batchify_fn, num_workers=num_workers)
     val_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
     val_loader = gluon.data.DataLoader(
         val_dataset.transform(YOLO3DefaultValTransform(width, height)),
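Note on the sampler introduced above: SplitSortedBucketSampler sorts the given keys (here, per-image aspect ratios) so that similarly shaped images land in the same batch, and splits the resulting batches into num_parts disjoint shards selected by part_index. Under Horovod each rank therefore trains on its own shard instead of every rank re-iterating the full dataset. Below is a minimal sketch of the sharding behaviour, using toy aspect ratios and two hypothetical ranks in place of hvd.size()/hvd.rank(); exact batch contents depend on the sampler's internal bucketing, so treat the output as illustrative only.

    import gluoncv as gcv

    aspect_ratios = [1.33, 0.75, 1.0, 1.5, 0.66, 1.33, 1.0, 0.8]  # toy values
    for rank in range(2):
        sampler = gcv.nn.sampler.SplitSortedBucketSampler(
            aspect_ratios, 2, num_parts=2, part_index=rank, shuffle=False)
        # A batch sampler yields lists of dataset indices; the two ranks
        # should draw batches from disjoint parts of the eight samples.
        print('rank', rank, list(sampler))
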
From 3018577df119831baef2240accba55c00f956193 Mon Sep 17 00:00:00 2001
From: Karan Jariwala
Date: Wed, 12 Aug 2020 21:51:50 -0700
Subject: [PATCH 2/2] Fixed throughput calculation based on log_interval

---
 scripts/detection/ssd/train_ssd.py    | 8 +++++---
 scripts/detection/yolo/train_yolo3.py | 6 ++++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/scripts/detection/ssd/train_ssd.py b/scripts/detection/ssd/train_ssd.py
index 833c71eac5..45dd7d4cd8 100644
--- a/scripts/detection/ssd/train_ssd.py
+++ b/scripts/detection/ssd/train_ssd.py
@@ -347,15 +347,17 @@ def train(net, train_data, val_data, eval_metric, ctx, args):
                 name1, loss1 = ce_metric.get()
                 name2, loss2 = smoothl1_metric.get()
                 logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
-                    epoch, i, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2))
-            btic = time.time()
+                    epoch, i, args.log_interval * args.batch_size / (time.time() - btic),
+                    name1, loss1, name2, loss2))
+                btic = time.time()
 
         if (not args.horovod or hvd.rank() == 0):
             name1, loss1 = ce_metric.get()
             name2, loss2 = smoothl1_metric.get()
             logger.info('[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.format(
                 epoch, (time.time()-tic), name1, loss1, name2, loss2))
-            if ((epoch + 1) % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
+            if ((epoch + 1) % args.val_interval == 0) or \
+                    (args.save_interval and (epoch + 1) % args.save_interval == 0):
                 # consider reduce the frequency of validation to save time
                 map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                 val_msg = '\n'.join(['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
diff --git a/scripts/detection/yolo/train_yolo3.py b/scripts/detection/yolo/train_yolo3.py
index 6088a2ec2e..84540f7a4c 100644
--- a/scripts/detection/yolo/train_yolo3.py
+++ b/scripts/detection/yolo/train_yolo3.py
@@ -313,8 +313,10 @@ def train(net, train_data, val_data, eval_metric, ctx, args):
                 name3, loss3 = scale_metrics.get()
                 name4, loss4 = cls_metrics.get()
                 logger.info('[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'.format(
-                    epoch, i, trainer.learning_rate, args.batch_size/(time.time()-btic), name1, loss1, name2, loss2, name3, loss3, name4, loss4))
-            btic = time.time()
+                    epoch, i, trainer.learning_rate,
+                    args.log_interval * args.batch_size / (time.time() - btic),
+                    name1, loss1, name2, loss2, name3, loss3, name4, loss4))
+                btic = time.time()
 
         if (not args.horovod or hvd.rank() == 0):
             name1, loss1 = obj_metrics.get()
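Note on the throughput fix in this second patch: btic is now reset only inside the logging branch, so the measured window spans log_interval batches rather than one, and the rate must be log_interval * batch_size divided by the elapsed time. The same patch also moves checkpointing onto the (epoch + 1) convention introduced for validation in the first patch, so save_interval and val_interval fire on the same epochs. A toy check of the corrected formula, with hypothetical numbers:

    batch_size, log_interval = 32, 100
    elapsed = 12.8  # seconds since btic was last reset, i.e. 100 batches ago
    print(log_interval * batch_size / elapsed)  # 250.0 samples/sec
    # The old expression, batch_size / elapsed, would report 2.5 samples/sec
    # for the same window once btic stopped being reset after every batch.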